1 #!/usr/bin/env python
3 """Create full-text indexes and search them
5 Notes:
7 See http://gnosis.cx/publish/programming/charming_python_15.txt
8 for a detailed discussion of this module.
10 This version requires Python 1.6+. It turns out that the use
11 of string methods rather than [string] module functions is
12 enough faster in a tight loop so as to provide a quite
13 remarkable 25% speedup in overall indexing. However, only FOUR
14 lines in TextSplitter.text_splitter() were changed away from
15 Python 1.5 compatibility. Those lines are followed by comments
16 beginning with "# 1.52: " that show the old forms. Python
17 1.5 users can restore these lines, and comment out those just
18 above them.
20 Classes:
22 GenericIndexer -- Abstract class
23 TextSplitter -- Mixin class
24 Index
25 ShelveIndexer
26 FlatIndexer
27 XMLPickleIndexer
28 PickleIndexer
29 ZPickleIndexer
30 SlicedZPickleIndexer
32 Functions:
34 echo_fname(fname)
35 recurse_files(...)
37 Index Formats:
39 *Indexer.files: filename --> (fileid, wordcount)
40 *Indexer.fileids: fileid --> filename
41 *Indexer.words: word --> {fileid1:occurs, fileid2:occurs, ...}
43 Module Usage:
45 There are a few ways to use this module. Just to utilize existing
46 functionality, something like the following is a likely
47 pattern:
49 import gnosis.indexer as indexer
50 index = indexer.MyFavoriteIndexer() # For some concrete Indexer
51 index.load_index('myIndex.db')
52 index.add_files(dir='/this/that/otherdir', pattern='*.txt')
53 hits = index.find(['spam','eggs','bacon'])
54 index.print_report(hits)
56 To customize the basic classes, something like the following is likely:
58 class MySplitter:
59 def splitter(self, text, ftype):
        "Perform much better splitting than default (for filetypes)"
61 # ...
62 return words
64 class MyIndexer(indexer.GenericIndexer, MySplitter):
65 def load_index(self, INDEXDB=None):
66 "Retrieve three dictionaries from clever storage method"
67 # ...
68 self.words, self.files, self.fileids = WORDS, FILES, FILEIDS
69 def save_index(self, INDEXDB=None):
70 "Save three dictionaries to clever storage method"
72 index = MyIndexer()
73 # ...etc...
75 Benchmarks:
77 As we know, there are lies, damn lies, and benchmarks. Take
78 the below with an adequate dose of salt. In version 0.10 of
79 the concrete indexers, some performance was tested. The
80 test case was a set of mail/news archives, that were about
81 43 mB, and 225 files. In each case, an index was generated
82 (if possible), and a search for the words "xml python" was
83 performed.
85 - Index w/ PickleIndexer: 482s, 2.4 mB
86 - Search w/ PickleIndexer: 1.74s
87 - Index w/ ZPickleIndexer: 484s, 1.2 mB
88 - Search w/ ZPickleIndexer: 1.77s
89 - Index w/ FlatIndexer: 492s, 2.6 mB
90 - Search w/ FlatIndexer: 53s
91 - Index w/ ShelveIndexer: (dumbdbm) Many minutes, tens of mBs
92 - Search w/ ShelveIndexer: Aborted before completely indexed
93 - Index w/ ShelveIndexer: (dbhash) Long time (partial crash), 10 mB
94 - Search w/ ShelveIndexer: N/A. Too many glitches
95 - Index w/ XMLPickleIndexer: Memory error (xml_pickle uses bad string
96 composition for large output)
97 - Search w/ XMLPickleIndexer: N/A
98 - grep search (xml|python): 20s (cached: <5s)
99 - 'srch' utility (python): 12s
100 """
101 #$Id: indexer.py,v 1.2 2002-05-25 07:16:24 rochecompaan Exp $
103 __shell_usage__ = """
104 Shell Usage: [python] indexer.py [options] [search_words]
106 -h, /h, -?, /?, ?, --help: Show this help screen
107 -index: Add files to index
108 -reindex: Refresh files already in the index
109 (can take much more time)
110 -casesensitive: Maintain the case of indexed words
111 (can lead to MUCH larger indices)
112 -norecurse, -local: Only index starting dir, not subdirs
113 -dir=<directory>: Starting directory for indexing
114 (default is current directory)
115 -indexdb=<database>: Use specified index database
116 (environ variable INDEXER_DB is preferred)
117 -regex=<pattern>: Index files matching regular expression
118 -glob=<pattern>: Index files matching glob pattern
119 -filter=<pattern> Only display results matching pattern
120 -output=<op>, -format=<opt>: How much detail on matches?
121 -<digit>: Quiet level (0=verbose ... 9=quiet)
123 Output/format options are ALL/EVERYTHING/VERBOSE, RATINGS/SCORES,
124 FILENAMES/NAMES/FILES, SUMMARY/REPORT"""
126 __version__ = "$Revision: 1.2 $"
127 __author__=["David Mertz (mertz@gnosis.cx)",]
128 __thanks_to__=["Pat Knight (p.knight@ktgroup.co.uk)",
129 "Gregory Popovitch (greg@gpy.com)", ]
130 __copyright__="""
131 This file is released to the public domain. I (dqm) would
132 appreciate it if you choose to keep derived works under terms
133 that promote freedom, but obviously am giving up any rights
134 to compel such.
135 """
137 __history__="""
138 0.1 Initial version.
140 0.11 Tweaked TextSplitter after some random experimentation.
142 0.12 Added SlicedZPickleIndexer (best choice, so far).
144 0.13 Pat Knight pointed out need for binary open()'s of
145 certain files under Windows.
147 0.14 Added '-filter' switch to search results.
149 0.15 Added direct read of gzip files
151 0.20 Gregory Popovitch did some profiling on TextSplitter,
152 and provided both huge speedups to the Python version
153 and hooks to a C extension class (ZopeTextSplitter).
154 A little refactoring by he and I (dqm) has nearly
155 doubled the speed of indexing
157 0.30 Module refactored into gnosis package. This is a
158 first pass, and various documentation and test cases
159 should be added later.
160 """
161 import string, re, os, fnmatch, sys, copy, gzip
162 from types import *
164 #-- Silly "do nothing" default recursive file processor
165 def echo_fname(fname): print fname
167 #-- "Recurse and process files" utility function
def recurse_files(curdir, pattern, exclusions, func=echo_fname, *args, **kw):
    """Recursively process files matching 'pattern' under 'curdir'

    'pattern' may be a glob string or a compiled regular expression;
    'exclusions' is a sequence of 4-character filename suffixes to
    skip; 'func' is called as func(fname, *args) for every matching
    file.  The keyword argument 'level' tracks recursion depth.
    """
    subdirs, files = [], []
    level = kw.get('level', 0)

    for name in os.listdir(curdir):
        fname = os.path.join(curdir, name)
        if name[-4:] in exclusions:
            pass # do not include binary file type
        elif os.path.isdir(fname) and not os.path.islink(fname):
            subdirs.append(fname)
        # kludge to detect a regular expression across python versions
        # (the 1.x branch is never evaluated under 2.x, so the
        # re.RegexObject attribute lookup is safe there)
        elif sys.version[0]=='1' and isinstance(pattern, re.RegexObject):
            if pattern.match(name):
                files.append(fname)
        elif sys.version[0]=='2' and type(pattern)==type(re.compile('')):
            if pattern.match(name):
                files.append(fname)
        elif type(pattern) is StringType:
            if fnmatch.fnmatch(name, pattern):
                files.append(fname)

    for fname in files:
        apply(func, (fname,)+args)
    for subdir in subdirs:
        # Fix: propagate the extra positional args into recursive
        # calls; they were silently dropped below the top directory
        apply(recurse_files,
              (subdir, pattern, exclusions, func)+args, {'level':level+1})
195 #-- Data bundle for index dictionaries
class Index:
    """Simple bundle holding the three index dictionaries

    A storage segment may carry only a subset of the three, so each
    attribute is bound only when a non-None value is supplied.
    """
    def __init__(self, words, files, fileids):
        for attr, value in (('WORDS', words),
                            ('FILES', files),
                            ('FILEIDS', fileids)):
            if value is not None:
                setattr(self, attr, value)
202 #-- "Split plain text into words" utility function
class TextSplitter:
    """Mixin class: split plain text into indexable words

    initSplitter() precomputes several translation tables so that word
    scrubbing and character classification run at C speed through the
    two-argument str.translate()/string.translate().
    """
    def initSplitter(self):
        "Precompute the character-class translation tables (run once)"
        prenum = string.join(map(chr, range(0,48)), '')     # bytes before '0'
        num2cap = string.join(map(chr, range(58,65)), '')   # between '9' and 'A'
        cap2low = string.join(map(chr, range(91,97)), '')   # between 'Z' and 'a'
        postlow = string.join(map(chr, range(123,256)), '') # bytes after 'z'
        nonword = prenum + num2cap + cap2low + postlow
        # Table mapping every non-alphanumeric byte to a space
        self.word_only = string.maketrans(nonword, " "*len(nonword))
        # Fix: range(58,256) rather than range(58,255) -- chr(255) is not
        # a digit either (compare postlow above, which does include 255)
        self.nondigits = string.join(map(chr, range(0,48)) + map(chr, range(58,256)), '')
        self.alpha = string.join(map(chr, range(65,91)) + map(chr, range(97,123)), '')
        self.ident = string.join(map(chr, range(256)), '')  # identity table
        self.init = 1

    def splitter(self, text, ftype):
        "Split the contents of a text string into a list of 'words'"
        if ftype == 'text/plain':
            words = self.text_splitter(text, self.casesensitive)
        else:
            raise NotImplementedError
        return words

    def text_splitter(self, text, casesensitive=0):
        """Split text/plain string into a list of words

        In version 0.20 this function is still fairly weak at
        identifying "real" words, and excluding gibberish
        strings.  As long as the indexer looks at "real" text
        files, it does pretty well; but if indexing of binary
        data is attempted, a lot of gibberish gets indexed.
        Suggestions on improving this are GREATLY APPRECIATED.
        """
        # Initialize some constants
        if not hasattr(self,'init'): self.initSplitter()

        # Speedup trick: attributes into local scope
        word_only = self.word_only
        ident = self.ident
        alpha = self.alpha
        nondigits = self.nondigits
        translate = string.translate

        # Let's adjust case if not case-sensitive
        if not casesensitive: text = string.upper(text)

        # Split the raw text
        allwords = string.split(text)

        # Finally, let's skip some words not worth indexing
        words = []
        for word in allwords:
            if len(word) > 25: continue # too long (probably gibberish)

            # Identify common patterns in non-word data (binary, UU/MIME, etc)
            num_nonalpha = len(word.translate(ident, alpha))
            numdigits = len(word.translate(ident, nondigits))
            # 1.52: num_nonalpha = len(translate(word, ident, alpha))
            # 1.52: numdigits = len(translate(word, ident, nondigits))
            if numdigits > len(word)-2:      # almost all digits
                if numdigits > 5:            # too many digits is gibberish
                    continue                 # a moderate number is year/zipcode/etc
            elif num_nonalpha*3 > len(word): # too much scattered nonalpha = gibberish
                continue

            word = word.translate(word_only) # Let's strip funny byte values
            # 1.52: word = translate(word, word_only)
            subwords = word.split()          # maybe embedded non-alphanumeric
            # 1.52: subwords = string.split(word)
            for subword in subwords:         # ...so we might have subwords
                if len(subword) <= 2: continue # too short a subword
                words.append(subword)
        return words
class ZopeTextSplitter:
    """Mixin class: split text with Zope's C Splitter extension

    Alternative to TextSplitter; splitting is never case-sensitive.
    Words in the stop list, or 25+ characters long, are discarded.
    """
    def initSplitter(self):
        "Build the stop-word lookup and the C splitter object (run once)"
        import Splitter
        stop_words=(
            'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across',
            'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone',
            'along', 'already', 'also', 'although', 'always', 'am', 'among',
            'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any',
            'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around',
            'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes',
            'becoming', 'been', 'before', 'beforehand', 'behind', 'being',
            'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both',
            'bottom', 'but', 'by', 'can', 'cannot', 'cant', 'con', 'could',
            'couldnt', 'cry', 'describe', 'detail', 'do', 'done', 'down', 'due',
            'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else',
            'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone',
            'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty',
            'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly',
            'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get',
            'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her',
            'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers',
            'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i',
            'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it',
            'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least',
            'less', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill',
            'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must',
            'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless',
            'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not',
            'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once',
            'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our',
            'ours', 'ourselves', 'out', 'over', 'own', 'per', 'perhaps',
            'please', 'pre', 'put', 'rather', 're', 'same', 'see', 'seem',
            'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should',
            'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some',
            'somehow', 'someone', 'something', 'sometime', 'sometimes',
            'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the',
            'their', 'them', 'themselves', 'then', 'thence', 'there',
            'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these',
            'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three',
            'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too',
            'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under',
            'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well',
            'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
            'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
            'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
            'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without',
            'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves',
            )
        # The C splitter wants the stop words as a dictionary
        lookup = {}
        for stopword in stop_words:
            lookup[stopword] = None
        self.stop_word_dict = lookup
        self.splitterobj = Splitter.getSplitter()
        self.init = 1

    def goodword(self, word):
        "Keep only words shorter than 25 characters"
        return len(word) < 25

    def splitter(self, text, ftype):
        """never case-sensitive"""
        if not hasattr(self,'init'):
            self.initSplitter()
        raw = self.splitterobj(text, self.stop_word_dict)
        return filter(self.goodword, raw)
337 #-- "Abstract" parent class for inherited indexers
338 # (does not handle storage in parent, other methods are primitive)
340 class GenericIndexer:
341 def __init__(self, **kw):
342 apply(self.configure, (), kw)
344 def whoami(self):
345 return self.__class__.__name__
347 def configure(self, REINDEX=0, CASESENSITIVE=0,
348 INDEXDB=os.environ.get('INDEXER_DB', 'TEMP_NDX.DB'),
349 ADD_PATTERN='*', QUIET=5):
350 "Configure settings used by indexing and storage/retrieval"
351 self.indexdb = INDEXDB
352 self.reindex = REINDEX
353 self.casesensitive = CASESENSITIVE
354 self.add_pattern = ADD_PATTERN
355 self.quiet = QUIET
356 self.filter = None
358 def add_files(self, dir=os.getcwd(), pattern=None, descend=1):
359 self.load_index()
360 exclusions = ('.zip','.pyc','.gif','.jpg','.dat','.dir')
361 if not pattern:
362 pattern = self.add_pattern
363 recurse_files(dir, pattern, exclusions, self.add_file)
364 # Rebuild the fileid index
365 self.fileids = {}
366 for fname in self.files.keys():
367 fileid = self.files[fname][0]
368 self.fileids[fileid] = fname
370 def add_file(self, fname, ftype='text/plain'):
371 "Index the contents of a regular file"
372 if self.files.has_key(fname): # Is file eligible for (re)indexing?
373 if self.reindex: # Reindexing enabled, cleanup dicts
374 self.purge_entry(fname, self.files, self.words)
375 else: # DO NOT reindex this file
376 if self.quiet < 5: print "Skipping", fname
377 return 0
379 # Read in the file (if possible)
380 try:
381 if fname[-3:] == '.gz':
382 text = gzip.open(fname).read()
383 else:
384 text = open(fname).read()
385 if self.quiet < 5: print "Indexing", fname
386 except IOError:
387 return 0
388 words = self.splitter(text, ftype)
390 # Find new file index, and assign it to filename
391 # (_TOP uses trick of negative to avoid conflict with file index)
392 self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
393 file_index = abs(self.files['_TOP'][0])
394 self.files[fname] = (file_index, len(words))
396 filedict = {}
397 for word in words:
398 if filedict.has_key(word):
399 filedict[word] = filedict[word]+1
400 else:
401 filedict[word] = 1
403 for word in filedict.keys():
404 if self.words.has_key(word):
405 entry = self.words[word]
406 else:
407 entry = {}
408 entry[file_index] = filedict[word]
409 self.words[word] = entry
411 def add_othertext(self, identifier):
412 """Index a textual source other than a plain file
414 A child class might want to implement this method (or a similar one)
415 in order to index textual sources such as SQL tables, URLs, clay
416 tablets, or whatever else. The identifier should uniquely pick out
417 the source of the text (whatever it is)
418 """
419 raise NotImplementedError
421 def save_index(self, INDEXDB=None):
422 raise NotImplementedError
424 def load_index(self, INDEXDB=None, reload=0, wordlist=None):
425 raise NotImplementedError
427 def find(self, wordlist, print_report=0):
428 "Locate files that match ALL the words in wordlist"
429 self.load_index(wordlist=wordlist)
430 entries = {}
431 hits = copy.copy(self.fileids) # Copy of fileids index
432 for word in wordlist:
433 if not self.casesensitive:
434 word = string.upper(word)
435 entry = self.words.get(word) # For each word, get index
436 entries[word] = entry # of matching files
437 if not entry: # Nothing for this one word (fail)
438 return 0
439 for fileid in hits.keys(): # Eliminate hits for every non-match
440 if not entry.has_key(fileid):
441 del hits[fileid]
442 if print_report:
443 self.print_report(hits, wordlist, entries)
444 return hits
446 def print_report(self, hits={}, wordlist=[], entries={}):
447 # Figure out what to actually print (based on QUIET level)
448 output = []
449 for fileid,fname in hits.items():
450 message = fname
451 if self.quiet <= 3:
452 wordcount = self.files[fname][1]
453 matches = 0
454 countmess = '\n'+' '*13+`wordcount`+' words; '
455 for word in wordlist:
456 if not self.casesensitive:
457 word = string.upper(word)
458 occurs = entries[word][fileid]
459 matches = matches+occurs
460 countmess = countmess +`occurs`+' '+word+'; '
461 message = string.ljust('[RATING: '
462 +`1000*matches/wordcount`+']',13)+message
463 if self.quiet <= 2: message = message +countmess +'\n'
464 if self.filter: # Using an output filter
465 if fnmatch.fnmatch(message, self.filter):
466 output.append(message)
467 else:
468 output.append(message)
470 if self.quiet <= 5:
471 print string.join(output,'\n')
472 sys.stderr.write('\n'+`len(output)`+' files matched wordlist: '+
473 `wordlist`+'\n')
474 return output
476 def purge_entry(self, fname, file_dct, word_dct):
477 "Remove a file from file index and word index"
478 try: # The easy part, cleanup the file index
479 file_index = file_dct[fname]
480 del file_dct[fname]
481 except KeyError:
482 pass # We'll assume we only encounter KeyError's
483 # The much harder part, cleanup the word index
484 for word, occurs in word_dct.items():
485 if occurs.has_key(file_index):
486 del occurs[file_index]
487 word_dct[word] = occurs
489 def index_loaded(self):
490 return ( hasattr(self,'fileids') and
491 hasattr(self,'files') and
492 hasattr(self,'words') )
494 #-- Provide an actual storage facility for the indexes (i.e. shelve)
class ShelveIndexer(GenericIndexer, TextSplitter):
    """Concrete Indexer utilizing [shelve] for storage

    Unfortunately, [shelve] proves far too slow in indexing, while
    creating monstrously large indexes.  Not recommended, at least under
    the default dbm's tested.  Also, class may be broken because
    shelves do not, apparently, support the .values() and .items()
    methods.  Fixing this is a low priority, but the sample code is
    left here.
    """
    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
        "Open the three shelve databases (created on first use)"
        INDEXDB = INDEXDB or self.indexdb
        import shelve
        self.words = shelve.open(INDEXDB+".WORDS")
        self.files = shelve.open(INDEXDB+".FILES")
        self.fileids = shelve.open(INDEXDB+".FILEIDS")
        # Fix: the original tested the unbound name FILES here (a
        # NameError); probe for the '_TOP' marker instead
        if not self.files.has_key('_TOP'): # New index
            self.files['_TOP'] = (0,None)

    def save_index(self, INDEXDB=None):
        "Shelves persist each write as it happens; nothing more to do"
        INDEXDB = INDEXDB or self.indexdb
        pass
518 class FlatIndexer(GenericIndexer, TextSplitter):
519 """Concrete Indexer utilizing flat-file for storage
521 See the comments in the referenced article for details; in
522 brief, this indexer has about the same timing as the best in
523 -creating- indexes and the storage requirements are
524 reasonable. However, actually -using- a flat-file index is
525 more than an order of magnitude worse than the best indexer
526 (ZPickleIndexer wins overall).
528 On the other hand, FlatIndexer creates a wonderfully easy to
529 parse database format if you have a reason to transport the
530 index to a different platform or programming language. And
531 should you perform indexing as part of a long-running
532 process, the overhead of initial file parsing becomes
533 irrelevant.
534 """
535 def load_index(self, INDEXDB=None, reload=0, wordlist=None):
536 # Unless reload is indicated, do not load twice
537 if self.index_loaded() and not reload: return 0
538 # Ok, now let's actually load it
539 INDEXDB = INDEXDB or self.indexdb
540 self.words = {}
541 self.files = {'_TOP':(0,None)}
542 self.fileids = {}
543 try: # Read index contents
544 for line in open(INDEXDB).readlines():
545 fields = string.split(line)
546 if fields[0] == '-': # Read a file/fileid line
547 fileid = eval(fields[2])
548 wordcount = eval(fields[3])
549 fname = fields[1]
550 self.files[fname] = (fileid, wordcount)
551 self.fileids[fileid] = fname
552 else: # Read a word entry (dict of hits)
553 entries = {}
554 word = fields[0]
555 for n in range(1,len(fields),2):
556 fileid = eval(fields[n])
557 occurs = eval(fields[n+1])
558 entries[fileid] = occurs
559 self.words[word] = entries
560 except:
561 pass # New index
563 def save_index(self, INDEXDB=None):
564 INDEXDB = INDEXDB or self.indexdb
565 tab, lf, sp = '\t','\n',' '
566 indexdb = open(INDEXDB,'w')
567 for fname,entry in self.files.items():
568 indexdb.write('- '+fname +tab +`entry[0]` +tab +`entry[1]` +lf)
569 for word,entry in self.words.items():
570 indexdb.write(word +tab+tab)
571 for fileid,occurs in entry.items():
572 indexdb.write(`fileid` +sp +`occurs` +sp)
573 indexdb.write(lf)
class PickleIndexer(GenericIndexer, TextSplitter):
    """Concrete Indexer storing the whole index as one binary cPickle

    The entire Index bundle (words, files, fileids) is serialized into
    a single file named by INDEXDB.
    """
    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
        "Unpickle the index database (only once, unless 'reload' is true)"
        if self.index_loaded() and not reload: return 0
        import cPickle
        INDEXDB = INDEXDB or self.indexdb
        try:
            raw = open(INDEXDB,'rb').read()
            db = cPickle.loads(raw)
        except: # Any problem reading/unpickling --> start a new index
            db = Index({}, {'_TOP':(0,None)}, {})
        self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS

    def save_index(self, INDEXDB=None):
        "Pickle the three index dictionaries to the database file"
        import cPickle
        INDEXDB = INDEXDB or self.indexdb
        bundle = Index(self.words, self.files, self.fileids)
        open(INDEXDB,'wb').write(cPickle.dumps(bundle, 1))
class XMLPickleIndexer(PickleIndexer):
    """Concrete Indexer utilizing XML for storage

    While this is, as expected, a verbose format, the possibility
    of using XML as a transport format for indexes might be
    useful.  However, [xml_pickle] is in need of some redesign to
    avoid gross inefficiency when creating very large
    (multi-megabyte) output files (fixed in [xml_pickle] version
    0.48 or above)
    """
    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
        "Load the index from an XML pickle (only once, unless 'reload')"
        if self.index_loaded() and not reload: return 0
        from gnosis.xml.pickle import XML_Pickler
        INDEXDB = INDEXDB or self.indexdb
        try: # XML file exists
            db = XML_Pickler().loads(open(INDEXDB).read())
        except: # Anything wrong --> start a new index
            db = Index({}, {'_TOP':(0,None)}, {})
        self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS

    def save_index(self, INDEXDB=None):
        "Serialize the whole index as one XML document"
        from gnosis.xml.pickle import XML_Pickler
        INDEXDB = INDEXDB or self.indexdb
        bundle = Index(self.words, self.files, self.fileids)
        open(INDEXDB,'w').write(XML_Pickler(bundle).dumps())
class ZPickleIndexer(PickleIndexer):
    """PickleIndexer variant that zlib-compresses the pickle

    Same single-file layout as PickleIndexer, but the database file
    (named with a trailing '!') holds a zlib-compressed pickle.
    """
    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
        "Decompress and unpickle the index (only once, unless 'reload')"
        if self.index_loaded() and not reload: return 0
        import cPickle, zlib
        INDEXDB = INDEXDB or self.indexdb
        try:
            compressed = open(INDEXDB+'!','rb').read()
            db = cPickle.loads(zlib.decompress(compressed))
        except: # Any problem reading/unpickling --> start a new index
            db = Index({}, {'_TOP':(0,None)}, {})
        self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS

    def save_index(self, INDEXDB=None):
        "Pickle, compress and write out the index bundle"
        import cPickle, zlib
        INDEXDB = INDEXDB or self.indexdb
        bundle = Index(self.words, self.files, self.fileids)
        out = open(INDEXDB+'!','wb')
        out.write(zlib.compress(cPickle.dumps(bundle, 1)))
class SlicedZPickleIndexer(ZPickleIndexer):
    """ZPickleIndexer variant that slices the index over many files

    The word dictionary is segmented by initial letter ('#' collects
    words not starting with A-Z); the filename/fileid dictionaries live
    in the '-' segment.  A search then loads only the segments for the
    requested words, which is far faster than unpickling one big index.
    """
    # One storage file per segment character; '!' is the (unsliced)
    # ZPickleIndexer filename suffix, kept so stale files get removed
    segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!"
    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
        "Load the segments needed for 'wordlist' (or all, if none given)"
        # Unless reload is indicated, do not load twice
        if self.index_loaded() and not reload: return 0
        # Ok, now let's actually load it
        import cPickle, zlib
        INDEXDB = INDEXDB or self.indexdb
        db = Index({}, {'_TOP':(0,None)}, {})
        # Identify the relevant word-dictionary segments
        if not wordlist:
            segments = self.segments
        else:
            # '-' has the file dicts; '#' may hold any non-letter words
            segments = ['-','#']
            for word in wordlist:
                segments.append(string.upper(word[0]))
        # Load the segments
        for segment in segments:
            try:
                pickle_str = zlib.decompress(open(INDEXDB+segment,'rb').read())
                dbslice = cPickle.loads(pickle_str)
                if dbslice.__dict__.get('WORDS'): # If it has some words, add them
                    for word,entry in dbslice.WORDS.items():
                        db.WORDS[word] = entry
                if dbslice.__dict__.get('FILES'): # If it has some files, add them
                    db.FILES = dbslice.FILES
                if dbslice.__dict__.get('FILEIDS'): # If it has fileids, add them
                    db.FILEIDS = dbslice.FILEIDS
            except:
                pass # No biggie, couldn't find this segment
        self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS

    def julienne(self, INDEXDB=None):
        "Slice the in-memory index into per-letter segment files"
        import cPickle, zlib
        INDEXDB = INDEXDB or self.indexdb
        segments = self.segments # all the (little) indexes
        for segment in segments:
            try: # brutal space saver... delete all the small segments
                os.remove(INDEXDB+segment)
            except OSError:
                pass # probably just nonexistent segment index file
        # First write the much simpler filename/fileid dictionaries
        dbfil = Index(None, self.files, self.fileids)
        open(INDEXDB+'-','wb').write(zlib.compress(cPickle.dumps(dbfil,1)))
        # The hard part is splitting the word dictionary up, of course
        letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        segdicts = {} # Need batch of empty dicts
        for segment in letters+'#':
            segdicts[segment] = {}
        for word, entry in self.words.items(): # Split into segment dicts
            initchar = string.upper(word[0])
            if initchar in letters:
                segdicts[initchar][word] = entry
            else:
                segdicts['#'][word] = entry
        # Write each segment dict as its own compressed pickle
        for initchar in letters+'#':
            db = Index(segdicts[initchar], None, None)
            pickle_str = cPickle.dumps(db, 1)
            filename = INDEXDB+initchar
            pickle_fh = open(filename,'wb')
            pickle_fh.write(zlib.compress(pickle_str))
            os.chmod(filename,0664) # rw-rw-r-- (Python 2 octal literal)

    # Saving always (re)slices the whole index
    save_index = julienne
711 PreferredIndexer = SlicedZPickleIndexer
713 #-- If called from command-line, parse arguments and take actions
714 if __name__ == '__main__':
715 import time
716 start = time.time()
717 search_words = [] # Word search list (if specified)
718 opts = 0 # Any options specified?
719 if len(sys.argv) < 2:
720 pass # No options given
721 else:
722 upper = string.upper
723 dir = os.getcwd() # Default to indexing from current directory
724 descend = 1 # Default to recursive indexing
725 ndx = PreferredIndexer()
726 for opt in sys.argv[1:]:
727 if opt in ('-h','/h','-?','/?','?','--help'): # help screen
728 print __shell_usage__
729 opts = -1
730 break
731 elif opt[0] in '/-': # a switch!
732 opts = opts+1
733 if upper(opt[1:]) == 'INDEX': # Index files
734 ndx.quiet = 0
735 pass # Use defaults if no other options
736 elif upper(opt[1:]) == 'REINDEX': # Reindex
737 ndx.reindex = 1
738 elif upper(opt[1:]) == 'CASESENSITIVE': # Case sensitive
739 ndx.casesensitive = 1
740 elif upper(opt[1:]) in ('NORECURSE','LOCAL'): # No recursion
741 descend = 0
742 elif upper(opt[1:4]) == 'DIR': # Dir to index
743 dir = opt[5:]
744 elif upper(opt[1:8]) == 'INDEXDB': # Index specified
745 ndx.indexdb = opt[9:]
746 sys.stderr.write(
747 "Use of INDEXER_DB environment variable is STRONGLY recommended.\n")
748 elif upper(opt[1:6]) == 'REGEX': # RegEx files to index
749 ndx.add_pattern = re.compile(opt[7:])
750 elif upper(opt[1:5]) == 'GLOB': # Glob files to index
751 ndx.add_pattern = opt[6:]
752 elif upper(opt[1:7]) in ('OUTPUT','FORMAT'): # How should results look?
753 opts = opts-1 # this is not an option for indexing purposes
754 level = upper(opt[8:])
755 if level in ('ALL','EVERYTHING','VERBOSE', 'MAX'):
756 ndx.quiet = 0
757 elif level in ('RATINGS','SCORES','HIGH'):
758 ndx.quiet = 3
759 elif level in ('FILENAMES','NAMES','FILES','MID'):
760 ndx.quiet = 5
761 elif level in ('SUMMARY','MIN'):
762 ndx.quiet = 9
763 elif upper(opt[1:7]) == 'FILTER': # Regex filter output
764 opts = opts-1 # this is not an option for indexing purposes
765 ndx.filter = opt[8:]
766 elif opt[1:] in string.digits:
767 opts = opts-1
768 ndx.quiet = eval(opt[1])
769 else:
770 search_words.append(opt) # Search words
772 if opts > 0:
773 ndx.add_files(dir=dir)
774 ndx.save_index()
775 if search_words:
776 ndx.find(search_words, print_report=1)
777 if not opts and not search_words:
778 sys.stderr.write("Perhaps you would like to use the --help option?\n")
779 else:
780 sys.stderr.write('Processed in %.3f seconds (%s)'
781 % (time.time()-start, ndx.whoami()))
783 #
784 #$Log: not supported by cvs2svn $
785 #Revision 1.1.2.3 2002/04/03 12:05:15 rochecompaan
786 #Removed dos control characters.
787 #
788 #Revision 1.1.2.2 2002/04/03 12:01:55 rochecompaan
789 #Oops. Forgot to include cvs keywords in file.
790 #