1 #!/usr/bin/env python
3 """Create full-text indexes and search them
5 Notes:
7 See http://gnosis.cx/publish/programming/charming_python_15.txt
8 for a detailed discussion of this module.
10 This version requires Python 1.6+. It turns out that the use
11 of string methods rather than [string] module functions is
12 enough faster in a tight loop so as to provide a quite
13 remarkable 25% speedup in overall indexing. However, only FOUR
14 lines in TextSplitter.text_splitter() were changed away from
15 Python 1.5 compatibility. Those lines are followed by comments
16 beginning with "# 1.52: " that show the old forms. Python
17 1.5 users can restore these lines, and comment out those just
18 above them.
20 Classes:
22 GenericIndexer -- Abstract class
23 TextSplitter -- Mixin class
24 Index
25 ShelveIndexer
26 FlatIndexer
27 XMLPickleIndexer
28 PickleIndexer
29 ZPickleIndexer
30 SlicedZPickleIndexer
32 Functions:
34 echo_fname(fname)
35 recurse_files(...)
37 Index Formats:
39 *Indexer.files: filename --> (fileid, wordcount)
40 *Indexer.fileids: fileid --> filename
41 *Indexer.words: word --> {fileid1:occurs, fileid2:occurs, ...}
43 Module Usage:
45 There are a few ways to use this module. Just to utilize existing
46 functionality, something like the following is a likely
47 pattern:
49 import gnosis.indexer as indexer
50 index = indexer.MyFavoriteIndexer() # For some concrete Indexer
51 index.load_index('myIndex.db')
52 index.add_files(dir='/this/that/otherdir', pattern='*.txt')
53 hits = index.find(['spam','eggs','bacon'])
54 index.print_report(hits)
56 To customize the basic classes, something like the following is likely:
58 class MySplitter:
59 def splitter(self, text, ftype):
        "Perform much better splitting than default (for filetypes)"
61 # ...
62 return words
64 class MyIndexer(indexer.GenericIndexer, MySplitter):
65 def load_index(self, INDEXDB=None):
66 "Retrieve three dictionaries from clever storage method"
67 # ...
68 self.words, self.files, self.fileids = WORDS, FILES, FILEIDS
69 def save_index(self, INDEXDB=None):
70 "Save three dictionaries to clever storage method"
72 index = MyIndexer()
73 # ...etc...
75 Benchmarks:
77 As we know, there are lies, damn lies, and benchmarks. Take
78 the below with an adequate dose of salt. In version 0.10 of
79 the concrete indexers, some performance was tested. The
80 test case was a set of mail/news archives, that were about
81 43 mB, and 225 files. In each case, an index was generated
82 (if possible), and a search for the words "xml python" was
83 performed.
85 - Index w/ PickleIndexer: 482s, 2.4 mB
86 - Search w/ PickleIndexer: 1.74s
87 - Index w/ ZPickleIndexer: 484s, 1.2 mB
88 - Search w/ ZPickleIndexer: 1.77s
89 - Index w/ FlatIndexer: 492s, 2.6 mB
90 - Search w/ FlatIndexer: 53s
91 - Index w/ ShelveIndexer: (dumbdbm) Many minutes, tens of mBs
92 - Search w/ ShelveIndexer: Aborted before completely indexed
93 - Index w/ ShelveIndexer: (dbhash) Long time (partial crash), 10 mB
94 - Search w/ ShelveIndexer: N/A. Too many glitches
95 - Index w/ XMLPickleIndexer: Memory error (xml_pickle uses bad string
96 composition for large output)
97 - Search w/ XMLPickleIndexer: N/A
98 - grep search (xml|python): 20s (cached: <5s)
99 - 'srch' utility (python): 12s
100 """
101 #$Id: indexer.py,v 1.2 2002-05-25 07:16:24 rochecompaan Exp $
103 __shell_usage__ = """
104 Shell Usage: [python] indexer.py [options] [search_words]
106 -h, /h, -?, /?, ?, --help: Show this help screen
107 -index: Add files to index
108 -reindex: Refresh files already in the index
109 (can take much more time)
110 -casesensitive: Maintain the case of indexed words
111 (can lead to MUCH larger indices)
112 -norecurse, -local: Only index starting dir, not subdirs
113 -dir=<directory>: Starting directory for indexing
114 (default is current directory)
115 -indexdb=<database>: Use specified index database
116 (environ variable INDEXER_DB is preferred)
117 -regex=<pattern>: Index files matching regular expression
118 -glob=<pattern>: Index files matching glob pattern
119 -filter=<pattern> Only display results matching pattern
120 -output=<op>, -format=<opt>: How much detail on matches?
121 -<digit>: Quiet level (0=verbose ... 9=quiet)
123 Output/format options are ALL/EVERYTHING/VERBOSE, RATINGS/SCORES,
124 FILENAMES/NAMES/FILES, SUMMARY/REPORT"""
126 __version__ = "$Revision: 1.2 $"
127 __author__=["David Mertz (mertz@gnosis.cx)",]
128 __thanks_to__=["Pat Knight (p.knight@ktgroup.co.uk)",
129 "Gregory Popovitch (greg@gpy.com)", ]
130 __copyright__="""
131 This file is released to the public domain. I (dqm) would
132 appreciate it if you choose to keep derived works under terms
133 that promote freedom, but obviously am giving up any rights
134 to compel such.
135 """
137 __history__="""
138 0.1 Initial version.
140 0.11 Tweaked TextSplitter after some random experimentation.
142 0.12 Added SlicedZPickleIndexer (best choice, so far).
144 0.13 Pat Knight pointed out need for binary open()'s of
145 certain files under Windows.
147 0.14 Added '-filter' switch to search results.
149 0.15 Added direct read of gzip files
151 0.20 Gregory Popovitch did some profiling on TextSplitter,
152 and provided both huge speedups to the Python version
153 and hooks to a C extension class (ZopeTextSplitter).
154 A little refactoring by he and I (dqm) has nearly
155 doubled the speed of indexing
157 0.30 Module refactored into gnosis package. This is a
158 first pass, and various documentation and test cases
159 should be added later.
160 """
161 import string, re, os, fnmatch, sys, copy, gzip
162 from types import *
164 #-- Silly "do nothing" default recursive file processor
165 def echo_fname(fname): print fname
167 #-- "Recurse and process files" utility function
def recurse_files(curdir, pattern, exclusions, func=echo_fname, *args, **kw):
    """Recursively process files matching 'pattern' under 'curdir'

    'pattern' may be a glob string or a compiled regular expression;
    'exclusions' is a sequence of 4-character filename suffixes to
    skip; 'func' is called as func(fname, *args) for every matching
    file.  The keyword argument 'level' tracks recursion depth.
    """
    subdirs, files = [], []
    level = kw.get('level', 0)

    for name in os.listdir(curdir):
        fname = os.path.join(curdir, name)
        if name[-4:] in exclusions:
            pass # do not include binary file type
        elif os.path.isdir(fname) and not os.path.islink(fname):
            subdirs.append(fname)
        # kludge to detect a regular expression across python versions
        # (the 1.x branch is never evaluated under 2.x, so the
        # re.RegexObject attribute lookup is safe there)
        elif sys.version[0]=='1' and isinstance(pattern, re.RegexObject):
            if pattern.match(name):
                files.append(fname)
        elif sys.version[0]=='2' and type(pattern)==type(re.compile('')):
            if pattern.match(name):
                files.append(fname)
        elif type(pattern) is StringType:
            if fnmatch.fnmatch(name, pattern):
                files.append(fname)

    for fname in files:
        apply(func, (fname,)+args)
    for subdir in subdirs:
        # Fix: propagate the extra positional args into recursive
        # calls; they were silently dropped below the top directory
        apply(recurse_files,
              (subdir, pattern, exclusions, func)+args, {'level':level+1})
195 #-- Data bundle for index dictionaries
class Index:
    """Simple bundle holding the three index dictionaries

    A storage segment may carry only a subset of the three, so each
    attribute is bound only when a non-None value is supplied.
    """
    def __init__(self, words, files, fileids):
        for attr, value in (('WORDS', words),
                            ('FILES', files),
                            ('FILEIDS', fileids)):
            if value is not None:
                setattr(self, attr, value)
202 #-- "Split plain text into words" utility function
class TextSplitter:
    """Mixin class: split plain text into indexable words

    initSplitter() precomputes several translation tables so that word
    scrubbing and character classification run at C speed through the
    two-argument str.translate()/string.translate().
    """
    def initSplitter(self):
        "Precompute the character-class translation tables (run once)"
        prenum = string.join(map(chr, range(0,48)), '')     # bytes before '0'
        num2cap = string.join(map(chr, range(58,65)), '')   # between '9' and 'A'
        cap2low = string.join(map(chr, range(91,97)), '')   # between 'Z' and 'a'
        postlow = string.join(map(chr, range(123,256)), '') # bytes after 'z'
        nonword = prenum + num2cap + cap2low + postlow
        # Table mapping every non-alphanumeric byte to a space
        self.word_only = string.maketrans(nonword, " "*len(nonword))
        # Fix: range(58,256) rather than range(58,255) -- chr(255) is not
        # a digit either (compare postlow above, which does include 255)
        self.nondigits = string.join(map(chr, range(0,48)) + map(chr, range(58,256)), '')
        self.alpha = string.join(map(chr, range(65,91)) + map(chr, range(97,123)), '')
        self.ident = string.join(map(chr, range(256)), '')  # identity table
        self.init = 1

    def splitter(self, text, ftype):
        "Split the contents of a text string into a list of 'words'"
        if ftype == 'text/plain':
            words = self.text_splitter(text, self.casesensitive)
        else:
            raise NotImplementedError
        return words

    def text_splitter(self, text, casesensitive=0):
        """Split text/plain string into a list of words

        In version 0.20 this function is still fairly weak at
        identifying "real" words, and excluding gibberish
        strings.  As long as the indexer looks at "real" text
        files, it does pretty well; but if indexing of binary
        data is attempted, a lot of gibberish gets indexed.
        Suggestions on improving this are GREATLY APPRECIATED.
        """
        # Initialize some constants
        if not hasattr(self,'init'): self.initSplitter()

        # Speedup trick: attributes into local scope
        word_only = self.word_only
        ident = self.ident
        alpha = self.alpha
        nondigits = self.nondigits
        translate = string.translate

        # Let's adjust case if not case-sensitive
        if not casesensitive: text = string.upper(text)

        # Split the raw text
        allwords = string.split(text)

        # Finally, let's skip some words not worth indexing
        words = []
        for word in allwords:
            if len(word) > 25: continue # too long (probably gibberish)

            # Identify common patterns in non-word data (binary, UU/MIME, etc)
            num_nonalpha = len(word.translate(ident, alpha))
            numdigits = len(word.translate(ident, nondigits))
            # 1.52: num_nonalpha = len(translate(word, ident, alpha))
            # 1.52: numdigits = len(translate(word, ident, nondigits))
            if numdigits > len(word)-2:      # almost all digits
                if numdigits > 5:            # too many digits is gibberish
                    continue                 # a moderate number is year/zipcode/etc
            elif num_nonalpha*3 > len(word): # too much scattered nonalpha = gibberish
                continue

            word = word.translate(word_only) # Let's strip funny byte values
            # 1.52: word = translate(word, word_only)
            subwords = word.split()          # maybe embedded non-alphanumeric
            # 1.52: subwords = string.split(word)
            for subword in subwords:         # ...so we might have subwords
                if len(subword) <= 2: continue # too short a subword
                words.append(subword)
        return words
class ZopeTextSplitter:
    """Mixin class: split text with Zope's C Splitter extension

    Alternative to TextSplitter; splitting is never case-sensitive.
    Words in the stop list, or 25+ characters long, are discarded.
    """
    def initSplitter(self):
        "Build the stop-word lookup and the C splitter object (run once)"
        import Splitter
        stop_words=(
            'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across',
            'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone',
            'along', 'already', 'also', 'although', 'always', 'am', 'among',
            'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any',
            'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around',
            'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes',
            'becoming', 'been', 'before', 'beforehand', 'behind', 'being',
            'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both',
            'bottom', 'but', 'by', 'can', 'cannot', 'cant', 'con', 'could',
            'couldnt', 'cry', 'describe', 'detail', 'do', 'done', 'down', 'due',
            'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else',
            'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone',
            'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty',
            'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly',
            'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get',
            'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her',
            'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers',
            'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i',
            'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it',
            'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least',
            'less', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill',
            'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must',
            'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless',
            'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not',
            'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once',
            'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our',
            'ours', 'ourselves', 'out', 'over', 'own', 'per', 'perhaps',
            'please', 'pre', 'put', 'rather', 're', 'same', 'see', 'seem',
            'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should',
            'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some',
            'somehow', 'someone', 'something', 'sometime', 'sometimes',
            'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the',
            'their', 'them', 'themselves', 'then', 'thence', 'there',
            'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these',
            'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three',
            'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too',
            'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under',
            'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well',
            'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
            'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
            'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
            'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without',
            'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves',
            )
        # The C splitter wants the stop words as a dictionary
        lookup = {}
        for stopword in stop_words:
            lookup[stopword] = None
        self.stop_word_dict = lookup
        self.splitterobj = Splitter.getSplitter()
        self.init = 1

    def goodword(self, word):
        "Keep only words shorter than 25 characters"
        return len(word) < 25

    def splitter(self, text, ftype):
        """never case-sensitive"""
        if not hasattr(self,'init'):
            self.initSplitter()
        raw = self.splitterobj(text, self.stop_word_dict)
        return filter(self.goodword, raw)
337 #-- "Abstract" parent class for inherited indexers
338 # (does not handle storage in parent, other methods are primitive)
340 class GenericIndexer:
341 def __init__(self, **kw):
342 apply(self.configure, (), kw)
344 def whoami(self):
345 return self.__class__.__name__
347 def configure(self, REINDEX=0, CASESENSITIVE=0,
348 INDEXDB=os.environ.get('INDEXER_DB', 'TEMP_NDX.DB'),
349 ADD_PATTERN='*', QUIET=5):
350 "Configure settings used by indexing and storage/retrieval"
351 self.indexdb = INDEXDB
352 self.reindex = REINDEX
353 self.casesensitive = CASESENSITIVE
354 self.add_pattern = ADD_PATTERN
355 self.quiet = QUIET
356 self.filter = None
358 def add_files(self, dir=os.getcwd(), pattern=None, descend=1):
359 self.load_index()
360 exclusions = ('.zip','.pyc','.gif','.jpg','.dat','.dir')
361 if not pattern:
362 pattern = self.add_pattern
363 recurse_files(dir, pattern, exclusions, self.add_file)
364 # Rebuild the fileid index
365 self.fileids = {}
366 for fname in self.files.keys():
367 fileid = self.files[fname][0]
368 self.fileids[fileid] = fname
370 def add_file(self, fname, ftype='text/plain'):
371 "Index the contents of a regular file"
372 if self.files.has_key(fname): # Is file eligible for (re)indexing?
373 if self.reindex: # Reindexing enabled, cleanup dicts
374 self.purge_entry(fname, self.files, self.words)
375 else: # DO NOT reindex this file
376 if self.quiet < 5: print "Skipping", fname
377 return 0
379 # Read in the file (if possible)
380 try:
381 if fname[-3:] == '.gz':
382 text = gzip.open(fname).read()
383 else:
384 text = open(fname).read()
385 if self.quiet < 5: print "Indexing", fname
386 except IOError:
387 return 0
388 words = self.splitter(text, ftype)
390 # Find new file index, and assign it to filename
391 # (_TOP uses trick of negative to avoid conflict with file index)
392 self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
393 file_index = abs(self.files['_TOP'][0])
394 self.files[fname] = (file_index, len(words))
396 filedict = {}
397 for word in words:
398 if filedict.has_key(word):
399 filedict[word] = filedict[word]+1
400 else:
401 filedict[word] = 1
403 for word in filedict.keys():
404 if self.words.has_key(word):
405 entry = self.words[word]
406 else:
407 entry = {}
408 entry[file_index] = filedict[word]
409 self.words[word] = entry
411 def add_othertext(self, identifier):
412 """Index a textual source other than a plain file
414 A child class might want to implement this method (or a similar one)
415 in order to index textual sources such as SQL tables, URLs, clay
416 tablets, or whatever else. The identifier should uniquely pick out
417 the source of the text (whatever it is)
418 """
419 raise NotImplementedError
421 def save_index(self, INDEXDB=None):
422 raise NotImplementedError
424 def load_index(self, INDEXDB=None, reload=0, wordlist=None):
425 raise NotImplementedError
427 def find(self, wordlist, print_report=0):
428 "Locate files that match ALL the words in wordlist"
429 self.load_index(wordlist=wordlist)
430 entries = {}
431 hits = copy.copy(self.fileids) # Copy of fileids index
432 for word in wordlist:
433 if not self.casesensitive:
434 word = string.upper(word)
435 entry = self.words.get(word) # For each word, get index
436 entries[word] = entry # of matching files
437 if not entry: # Nothing for this one word (fail)
438 return 0
439 for fileid in hits.keys(): # Eliminate hits for every non-match
440 if not entry.has_key(fileid):
441 del hits[fileid]
442 if print_report:
443 self.print_report(hits, wordlist, entries)
444 return hits
446 def print_report(self, hits={}, wordlist=[], entries={}):
447 # Figure out what to actually print (based on QUIET level)
448 output = []
449 for fileid,fname in hits.items():
450 message = fname
451 if self.quiet <= 3:
452 wordcount = self.files[fname][1]
453 matches = 0
454 countmess = '\n'+' '*13+`wordcount`+' words; '
455 for word in wordlist:
456 if not self.casesensitive:
457 word = string.upper(word)
458 occurs = entries[word][fileid]
459 matches = matches+occurs
460 countmess = countmess +`occurs`+' '+word+'; '
461 message = string.ljust('[RATING: '
462 +`1000*matches/wordcount`+']',13)+message
463 if self.quiet <= 2: message = message +countmess +'\n'
464 if self.filter: # Using an output filter
465 if fnmatch.fnmatch(message, self.filter):
466 output.append(message)
467 else:
468 output.append(message)
470 if self.quiet <= 5:
471 print string.join(output,'\n')
472 sys.stderr.write('\n'+`len(output)`+' files matched wordlist: '+
473 `wordlist`+'\n')
474 return output
476 def purge_entry(self, fname, file_dct, word_dct):
477 "Remove a file from file index and word index"
478 try: # The easy part, cleanup the file index
479 file_index = file_dct[fname]
480 del file_dct[fname]
481 except KeyError:
482 pass # We'll assume we only encounter KeyError's
483 # The much harder part, cleanup the word index
484 for word, occurs in word_dct.items():
485 if occurs.has_key(file_index):
486 del occurs[file_index]
487 word_dct[word] = occurs
489 def index_loaded(self):
490 return ( hasattr(self,'fileids') and
491 hasattr(self,'files') and
492 hasattr(self,'words') )
494 #-- Provide an actual storage facility for the indexes (i.e. shelve)
class ShelveIndexer(GenericIndexer, TextSplitter):
    """Concrete Indexer utilizing [shelve] for storage

    Unfortunately, [shelve] proves far too slow in indexing, while
    creating monstrously large indexes.  Not recommended, at least under
    the default dbm's tested.  Also, class may be broken because
    shelves do not, apparently, support the .values() and .items()
    methods.  Fixing this is a low priority, but the sample code is
    left here.
    """
    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
        "Open the three shelve databases (created on first use)"
        INDEXDB = INDEXDB or self.indexdb
        import shelve
        self.words = shelve.open(INDEXDB+".WORDS")
        self.files = shelve.open(INDEXDB+".FILES")
        self.fileids = shelve.open(INDEXDB+".FILEIDS")
        # Fix: the original tested the unbound name FILES here (a
        # NameError); probe for the '_TOP' marker instead
        if not self.files.has_key('_TOP'): # New index
            self.files['_TOP'] = (0,None)

    def save_index(self, INDEXDB=None):
        "Shelves persist each write as it happens; nothing more to do"
        INDEXDB = INDEXDB or self.indexdb
        pass
518 class FlatIndexer(GenericIndexer, TextSplitter):
519 """Concrete Indexer utilizing flat-file for storage
521 See the comments in the referenced article for details; in
522 brief, this indexer has about the same timing as the best in
523 -creating- indexes and the storage requirements are
524 reasonable. However, actually -using- a flat-file index is
525 more than an order of magnitude worse than the best indexer
526 (ZPickleIndexer wins overall).
528 On the other hand, FlatIndexer creates a wonderfully easy to
529 parse database format if you have a reason to transport the
530 index to a different platform or programming language. And
531 should you perform indexing as part of a long-running
532 process, the overhead of initial file parsing becomes
533 irrelevant.
534 """
535 def load_index(self, INDEXDB=None, reload=0, wordlist=None):
536 # Unless reload is indicated, do not load twice
537 if self.index_loaded() and not reload: return 0
538 # Ok, now let's actually load it
539 INDEXDB = INDEXDB or self.indexdb
540 self.words = {}
541 self.files = {'_TOP':(0,None)}
542 self.fileids = {}
543 try: # Read index contents
544 for line in open(INDEXDB).readlines():
545 fields = string.split(line)
546 if fields[0] == '-': # Read a file/fileid line
547 fileid = eval(fields[2])
548 wordcount = eval(fields[3])
549 fname = fields[1]
550 self.files[fname] = (fileid, wordcount)
551 self.fileids[fileid] = fname
552 else: # Read a word entry (dict of hits)
553 entries = {}
554 word = fields[0]
555 for n in range(1,len(fields),2):
556 fileid = eval(fields[n])
557 occurs = eval(fields[n+1])
558 entries[fileid] = occurs
559 self.words[word] = entries
560 except:
561 pass # New index
563 def save_index(self, INDEXDB=None):
564 INDEXDB = INDEXDB or self.indexdb
565 tab, lf, sp = '\t','\n',' '
566 indexdb = open(INDEXDB,'w')
567 for fname,entry in self.files.items():
568 indexdb.write('- '+fname +tab +`entry[0]` +tab +`entry[1]` +lf)
569 for word,entry in self.words.items():
570 indexdb.write(word +tab+tab)
571 for fileid,occurs in entry.items():
572 indexdb.write(`fileid` +sp +`occurs` +sp)
573 indexdb.write(lf)
class PickleIndexer(GenericIndexer, TextSplitter):
    """Concrete Indexer storing the whole index as one binary cPickle

    The entire Index bundle (words, files, fileids) is serialized into
    a single file named by INDEXDB.
    """
    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
        "Unpickle the index database (only once, unless 'reload' is true)"
        if self.index_loaded() and not reload: return 0
        import cPickle
        INDEXDB = INDEXDB or self.indexdb
        try:
            raw = open(INDEXDB,'rb').read()
            db = cPickle.loads(raw)
        except: # Any problem reading/unpickling --> start a new index
            db = Index({}, {'_TOP':(0,None)}, {})
        self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS

    def save_index(self, INDEXDB=None):
        "Pickle the three index dictionaries to the database file"
        import cPickle
        INDEXDB = INDEXDB or self.indexdb
        bundle = Index(self.words, self.files, self.fileids)
        open(INDEXDB,'wb').write(cPickle.dumps(bundle, 1))
class XMLPickleIndexer(PickleIndexer):
    """Concrete Indexer utilizing XML for storage

    While this is, as expected, a verbose format, the possibility
    of using XML as a transport format for indexes might be
    useful.  However, [xml_pickle] is in need of some redesign to
    avoid gross inefficiency when creating very large
    (multi-megabyte) output files (fixed in [xml_pickle] version
    0.48 or above)
    """
    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
        "Load the index from an XML pickle (only once, unless 'reload')"
        if self.index_loaded() and not reload: return 0
        from gnosis.xml.pickle import XML_Pickler
        INDEXDB = INDEXDB or self.indexdb
        try: # XML file exists
            db = XML_Pickler().loads(open(INDEXDB).read())
        except: # Anything wrong --> start a new index
            db = Index({}, {'_TOP':(0,None)}, {})
        self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS

    def save_index(self, INDEXDB=None):
        "Serialize the whole index as one XML document"
        from gnosis.xml.pickle import XML_Pickler
        INDEXDB = INDEXDB or self.indexdb
        bundle = Index(self.words, self.files, self.fileids)
        open(INDEXDB,'w').write(XML_Pickler(bundle).dumps())
class ZPickleIndexer(PickleIndexer):
    """PickleIndexer variant that zlib-compresses the pickle

    Same single-file layout as PickleIndexer, but the database file
    (named with a trailing '!') holds a zlib-compressed pickle.
    """
    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
        "Decompress and unpickle the index (only once, unless 'reload')"
        if self.index_loaded() and not reload: return 0
        import cPickle, zlib
        INDEXDB = INDEXDB or self.indexdb
        try:
            compressed = open(INDEXDB+'!','rb').read()
            db = cPickle.loads(zlib.decompress(compressed))
        except: # Any problem reading/unpickling --> start a new index
            db = Index({}, {'_TOP':(0,None)}, {})
        self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS

    def save_index(self, INDEXDB=None):
        "Pickle, compress and write out the index bundle"
        import cPickle, zlib
        INDEXDB = INDEXDB or self.indexdb
        bundle = Index(self.words, self.files, self.fileids)
        out = open(INDEXDB+'!','wb')
        out.write(zlib.compress(cPickle.dumps(bundle, 1)))
class SlicedZPickleIndexer(ZPickleIndexer):
    """ZPickleIndexer variant that slices the index over many files

    The word dictionary is segmented by initial letter ('#' collects
    words not starting with A-Z); the filename/fileid dictionaries live
    in the '-' segment.  A search then loads only the segments for the
    requested words, which is far faster than unpickling one big index.
    """
    # One storage file per segment character; '!' is the (unsliced)
    # ZPickleIndexer filename suffix, kept so stale files get removed
    segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!"
    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
        "Load the segments needed for 'wordlist' (or all, if none given)"
        # Unless reload is indicated, do not load twice
        if self.index_loaded() and not reload: return 0
        # Ok, now let's actually load it
        import cPickle, zlib
        INDEXDB = INDEXDB or self.indexdb
        db = Index({}, {'_TOP':(0,None)}, {})
        # Identify the relevant word-dictionary segments
        if not wordlist:
            segments = self.segments
        else:
            # '-' has the file dicts; '#' may hold any non-letter words
            segments = ['-','#']
            for word in wordlist:
                segments.append(string.upper(word[0]))
        # Load the segments
        for segment in segments:
            try:
                pickle_str = zlib.decompress(open(INDEXDB+segment,'rb').read())
                dbslice = cPickle.loads(pickle_str)
                if dbslice.__dict__.get('WORDS'): # If it has some words, add them
                    for word,entry in dbslice.WORDS.items():
                        db.WORDS[word] = entry
                if dbslice.__dict__.get('FILES'): # If it has some files, add them
                    db.FILES = dbslice.FILES
                if dbslice.__dict__.get('FILEIDS'): # If it has fileids, add them
                    db.FILEIDS = dbslice.FILEIDS
            except:
                pass # No biggie, couldn't find this segment
        self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS

    def julienne(self, INDEXDB=None):
        "Slice the in-memory index into per-letter segment files"
        import cPickle, zlib
        INDEXDB = INDEXDB or self.indexdb
        segments = self.segments # all the (little) indexes
        for segment in segments:
            try: # brutal space saver... delete all the small segments
                os.remove(INDEXDB+segment)
            except OSError:
                pass # probably just nonexistent segment index file
        # First write the much simpler filename/fileid dictionaries
        dbfil = Index(None, self.files, self.fileids)
        open(INDEXDB+'-','wb').write(zlib.compress(cPickle.dumps(dbfil,1)))
        # The hard part is splitting the word dictionary up, of course
        letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        segdicts = {} # Need batch of empty dicts
        for segment in letters+'#':
            segdicts[segment] = {}
        for word, entry in self.words.items(): # Split into segment dicts
            initchar = string.upper(word[0])
            if initchar in letters:
                segdicts[initchar][word] = entry
            else:
                segdicts['#'][word] = entry
        # Write each segment dict as its own compressed pickle
        for initchar in letters+'#':
            db = Index(segdicts[initchar], None, None)
            pickle_str = cPickle.dumps(db, 1)
            filename = INDEXDB+initchar
            pickle_fh = open(filename,'wb')
            pickle_fh.write(zlib.compress(pickle_str))
            os.chmod(filename,0664) # rw-rw-r-- (Python 2 octal literal)

    # Saving always (re)slices the whole index
    save_index = julienne
711 PreferredIndexer = SlicedZPickleIndexer
713 #-- If called from command-line, parse arguments and take actions
714 if __name__ == '__main__':
715 import time
716 start = time.time()
717 search_words = [] # Word search list (if specified)
718 opts = 0 # Any options specified?
719 if len(sys.argv) < 2:
720 pass # No options given
721 else:
722 upper = string.upper
723 dir = os.getcwd() # Default to indexing from current directory
724 descend = 1 # Default to recursive indexing
725 ndx = PreferredIndexer()
726 for opt in sys.argv[1:]:
727 if opt in ('-h','/h','-?','/?','?','--help'): # help screen
728 print __shell_usage__
729 opts = -1
730 break
731 elif opt[0] in '/-': # a switch!
732 opts = opts+1
733 if upper(opt[1:]) == 'INDEX': # Index files
734 ndx.quiet = 0
735 pass # Use defaults if no other options
736 elif upper(opt[1:]) == 'REINDEX': # Reindex
737 ndx.reindex = 1
738 elif upper(opt[1:]) == 'CASESENSITIVE': # Case sensitive
739 ndx.casesensitive = 1
740 elif upper(opt[1:]) in ('NORECURSE','LOCAL'): # No recursion
741 descend = 0
742 elif upper(opt[1:4]) == 'DIR': # Dir to index
743 dir = opt[5:]
744 elif upper(opt[1:8]) == 'INDEXDB': # Index specified
745 ndx.indexdb = opt[9:]
746 sys.stderr.write(
747 "Use of INDEXER_DB environment variable is STRONGLY recommended.\n")
748 elif upper(opt[1:6]) == 'REGEX': # RegEx files to index
749 ndx.add_pattern = re.compile(opt[7:])
750 elif upper(opt[1:5]) == 'GLOB': # Glob files to index
751 ndx.add_pattern = opt[6:]
752 elif upper(opt[1:7]) in ('OUTPUT','FORMAT'): # How should results look?
753 opts = opts-1 # this is not an option for indexing purposes
754 level = upper(opt[8:])
755 if level in ('ALL','EVERYTHING','VERBOSE', 'MAX'):
756 ndx.quiet = 0
757 elif level in ('RATINGS','SCORES','HIGH'):
758 ndx.quiet = 3
759 elif level in ('FILENAMES','NAMES','FILES','MID'):
760 ndx.quiet = 5
761 elif level in ('SUMMARY','MIN'):
762 ndx.quiet = 9
763 elif upper(opt[1:7]) == 'FILTER': # Regex filter output
764 opts = opts-1 # this is not an option for indexing purposes
765 ndx.filter = opt[8:]
766 elif opt[1:] in string.digits:
767 opts = opts-1
768 ndx.quiet = eval(opt[1])
769 else:
770 search_words.append(opt) # Search words
772 if opts > 0:
773 ndx.add_files(dir=dir)
774 ndx.save_index()
775 if search_words:
776 ndx.find(search_words, print_report=1)
777 if not opts and not search_words:
778 sys.stderr.write("Perhaps you would like to use the --help option?\n")
779 else:
780 sys.stderr.write('Processed in %.3f seconds (%s)'
781 % (time.time()-start, ndx.whoami()))
783 #
784 #$Log: not supported by cvs2svn $
785 #Revision 1.1.2.3 2002/04/03 12:05:15 rochecompaan
786 #Removed dos control characters.
787 #
788 #Revision 1.1.2.2 2002/04/03 12:01:55 rochecompaan
789 #Oops. Forgot to include cvs keywords in file.
790 #