oops

[roundup.git] / roundup / indexer.py
diff --git a/roundup/indexer.py b/roundup/indexer.py

index 47f0120366129ba431368788dffcb4222e53dd40..814425a5848798585704549ad35f965be45e19c8 100644 (file)
--- a/roundup/indexer.py
+++ b/roundup/indexer.py
@@ -1,398 +1,95 @@
-#!/usr/bin/env python
-
-"""Create full-text indexes and search them
-
-Notes:
-
-  See http://gnosis.cx/publish/programming/charming_python_15.txt
-  for a detailed discussion of this module.
-
-  This version requires Python 1.6+.  It turns out that the use
-  of string methods rather than [string] module functions is
-  enough faster in a tight loop so as to provide a quite
-  remarkable 25% speedup in overall indexing.  However, only FOUR
-  lines in TextSplitter.text_splitter() were changed away from
-  Python 1.5 compatibility.  Those lines are followed by comments
-  beginning with "# 1.52:  " that show the old forms.  Python
-  1.5 users can restore these lines, and comment out those just
-  above them.
-
-Classes:
-
-    GenericIndexer      -- Abstract class
-    TextSplitter        -- Mixin class
-    Index
-    ShelveIndexer
-    FlatIndexer
-    XMLPickleIndexer
-    PickleIndexer
-    ZPickleIndexer
-    SlicedZPickleIndexer
-
-Functions:
-
-    echo_fname(fname)
-    recurse_files(...)
-
-Index Formats:
-
-    *Indexer.files:     filename --> (fileid, wordcount)
-    *Indexer.fileids:   fileid --> filename
-    *Indexer.words:     word --> {fileid1:occurs, fileid2:occurs, ...}
-
-Module Usage:
-
-  There are a few ways to use this module.  Just to utilize existing
-  functionality, something like the following is a likely
-  pattern:
-
-      import gnosis.indexer as indexer
-      index = indexer.MyFavoriteIndexer()     # For some concrete Indexer
-      index.load_index('myIndex.db')
-      index.add_files(dir='/this/that/otherdir', pattern='*.txt')
-      hits = index.find(['spam','eggs','bacon'])
-      index.print_report(hits)
-
-  To customize the basic classes, something like the following is likely:
-
-      class MySplitter:
-          def splitter(self, text, ftype):
-              "Peform much better splitting than default (for filetypes)"
-              # ...
-              return words
-
-      class MyIndexer(indexer.GenericIndexer, MySplitter):
-          def load_index(self, INDEXDB=None):
-              "Retrieve three dictionaries from clever storage method"
-              # ...
-              self.words, self.files, self.fileids = WORDS, FILES, FILEIDS
-          def save_index(self, INDEXDB=None):
-              "Save three dictionaries to clever storage method"
-
-      index = MyIndexer()
-      # ...etc...
-
-Benchmarks:
-
-  As we know, there are lies, damn lies, and benchmarks.  Take
-  the below with an adequate dose of salt.  In version 0.10 of
-  the concrete indexers, some performance was tested.  The
-  test case was a set of mail/news archives, that were about
-  43 mB, and 225 files.  In each case, an index was generated
-  (if possible), and a search for the words "xml python" was
-  performed.
-
-    - Index w/ PickleIndexer:     482s, 2.4 mB
-    - Search w/ PickleIndexer:    1.74s
-    - Index w/ ZPickleIndexer:    484s, 1.2 mB
-    - Search w/ ZPickleIndexer:   1.77s
-    - Index w/ FlatIndexer:       492s, 2.6 mB
-    - Search w/ FlatIndexer:      53s
-    - Index w/ ShelveIndexer:     (dumbdbm) Many minutes, tens of mBs
-    - Search w/ ShelveIndexer:    Aborted before completely indexed
-    - Index w/ ShelveIndexer:     (dbhash) Long time (partial crash), 10 mB
-    - Search w/ ShelveIndexer:    N/A. Too many glitches
-    - Index w/ XMLPickleIndexer:  Memory error (xml_pickle uses bad string
-                                                composition for large output)
-    - Search w/ XMLPickleIndexer: N/A
-    - grep search (xml|python):   20s (cached: <5s)
-    - 'srch' utility (python):    12s
-"""
-#$Id: indexer.py,v 1.2 2002-05-25 07:16:24 rochecompaan Exp $
-
-__shell_usage__ = """
-Shell Usage: [python] indexer.py [options] [search_words]
-
-    -h, /h, -?, /?, ?, --help:    Show this help screen
-    -index:                       Add files to index
-    -reindex:                     Refresh files already in the index
-                                  (can take much more time)
-    -casesensitive:               Maintain the case of indexed words
-                                  (can lead to MUCH larger indices)
-    -norecurse, -local:           Only index starting dir, not subdirs
-    -dir=<directory>:             Starting directory for indexing
-                                  (default is current directory)
-    -indexdb=<database>:          Use specified index database
-                                  (environ variable INDEXER_DB is preferred)
-    -regex=<pattern>:             Index files matching regular expression
-    -glob=<pattern>:              Index files matching glob pattern
-    -filter=<pattern>             Only display results matching pattern
-    -output=<op>, -format=<opt>:  How much detail on matches?
-    -<digit>:                     Quiet level (0=verbose ... 9=quiet)
-
-Output/format options are ALL/EVERYTHING/VERBOSE, RATINGS/SCORES,
-FILENAMES/NAMES/FILES, SUMMARY/REPORT"""
-
-__version__ = "$Revision: 1.2 $"
-__author__=["David Mertz (mertz@gnosis.cx)",]
-__thanks_to__=["Pat Knight (p.knight@ktgroup.co.uk)",
-               "Gregory Popovitch (greg@gpy.com)", ]
-__copyright__="""
-    This file is released to the public domain.  I (dqm) would
-    appreciate it if you choose to keep derived works under terms
-    that promote freedom, but obviously am giving up any rights
-    to compel such.
-"""
-
-__history__="""
-    0.1    Initial version.
-
-    0.11   Tweaked TextSplitter after some random experimentation.
-
-    0.12   Added SlicedZPickleIndexer (best choice, so far).
-
-    0.13   Pat Knight pointed out need for binary open()'s of
-           certain files under Windows.
-
-    0.14   Added '-filter' switch to search results.
-
-    0.15   Added direct read of gzip files
-
-    0.20   Gregory Popovitch did some profiling on TextSplitter,
-           and provided both huge speedups to the Python version
-           and hooks to a C extension class (ZopeTextSplitter).
-           A little refactoring by he and I (dqm) has nearly
-           doubled the speed of indexing
-
-    0.30  Module refactored into gnosis package.  This is a
-          first pass, and various documentation and test cases
-          should be added later.
-"""
-import string, re, os, fnmatch, sys, copy, gzip
-from types import *
-
-#-- Silly "do nothing" default recursive file processor
-def echo_fname(fname): print fname
-
-#-- "Recurse and process files" utility function
-def recurse_files(curdir, pattern, exclusions, func=echo_fname, *args, **kw):
-    "Recursively process file pattern"
-    subdirs, files = [],[]
-    level = kw.get('level',0)
-
-    for name in os.listdir(curdir):
-        fname = os.path.join(curdir, name)
-        if name[-4:] in exclusions:
-            pass            # do not include binary file type
-        elif os.path.isdir(fname) and not os.path.islink(fname):
-            subdirs.append(fname)
-        # kludge to detect a regular expression across python versions
-        elif sys.version[0]=='1' and isinstance(pattern, re.RegexObject):
-            if pattern.match(name):
-                files.append(fname)
-        elif sys.version[0]=='2' and type(pattern)==type(re.compile('')):
-            if pattern.match(name):
-                files.append(fname)
-        elif type(pattern) is StringType:
-            if fnmatch.fnmatch(name, pattern):
-                files.append(fname)
-
-    for fname in files:
-        apply(func, (fname,)+args)
-    for subdir in subdirs:
-        recurse_files(subdir, pattern, exclusions, func, level=level+1)
-
-#-- Data bundle for index dictionaries
-class Index:
-    def __init__(self, words, files, fileids):
-        if words is not None:   self.WORDS = words
-        if files is not None:   self.FILES = files
-        if fileids is not None: self.FILEIDS = fileids
-
-#-- "Split plain text into words" utility function
-class TextSplitter:
-    def initSplitter(self):
-        prenum  = string.join(map(chr, range(0,48)), '')
-        num2cap = string.join(map(chr, range(58,65)), '')
-        cap2low = string.join(map(chr, range(91,97)), '')
-        postlow = string.join(map(chr, range(123,256)), '')
-        nonword = prenum + num2cap + cap2low + postlow
-        self.word_only = string.maketrans(nonword, " "*len(nonword))
-        self.nondigits = string.join(map(chr, range(0,48)) + map(chr, range(58,255)), '')
-        self.alpha = string.join(map(chr, range(65,91)) + map(chr, range(97,123)), '')
-        self.ident = string.join(map(chr, range(256)), '')
-        self.init = 1
-
-    def splitter(self, text, ftype):
-        "Split the contents of a text string into a list of 'words'"
-        if ftype == 'text/plain':
-            words = self.text_splitter(text, self.casesensitive)
-        else:
-            raise NotImplementedError
-        return words
-
-    def text_splitter(self, text, casesensitive=0):
-        """Split text/plain string into a list of words
-
-        In version 0.20 this function is still fairly weak at
-        identifying "real" words, and excluding gibberish
-        strings.  As long as the indexer looks at "real" text
-        files, it does pretty well; but if indexing of binary
-        data is attempted, a lot of gibberish gets indexed.
-        Suggestions on improving this are GREATLY APPRECIATED.
-        """
-        # Initialize some constants
-        if not hasattr(self,'init'): self.initSplitter()
-
-        # Speedup trick: attributes into local scope
-        word_only = self.word_only
-        ident = self.ident
-        alpha = self.alpha
-        nondigits = self.nondigits
-        translate = string.translate
-
-        # Let's adjust case if not case-sensitive
-        if not casesensitive: text = string.upper(text)
-
-        # Split the raw text
-        allwords = string.split(text)
-
-        # Finally, let's skip some words not worth indexing
-        words = []
-        for word in allwords:
-            if len(word) > 25: continue         # too long (probably gibberish)
-
-            # Identify common patterns in non-word data (binary, UU/MIME, etc)
-            num_nonalpha = len(word.translate(ident, alpha))
-            numdigits    = len(word.translate(ident, nondigits))
-            # 1.52: num_nonalpha = len(translate(word, ident, alpha))
-            # 1.52: numdigits    = len(translate(word, ident, nondigits))
-            if numdigits > len(word)-2:         # almost all digits
-                if numdigits > 5:               # too many digits is gibberish
-                    continue                    # a moderate number is year/zipcode/etc
-            elif num_nonalpha*3 > len(word):    # too much scattered nonalpha = gibberish
-                continue
-
-            word = word.translate(word_only)    # Let's strip funny byte values
-            # 1.52: word = translate(word, word_only)
-            subwords = word.split()             # maybe embedded non-alphanumeric
-            # 1.52: subwords = string.split(word)
-            for subword in subwords:            # ...so we might have subwords
-                if len(subword) <= 2: continue  # too short a subword
-                words.append(subword)
-        return words
-
-class  ZopeTextSplitter:
-    def initSplitter(self):
-        import Splitter
-        stop_words=(
-            'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across',
-            'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone',
-            'along', 'already', 'also', 'although', 'always', 'am', 'among',
-            'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any',
-            'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around',
-            'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes',
-            'becoming', 'been', 'before', 'beforehand', 'behind', 'being',
-            'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both',
-            'bottom', 'but', 'by', 'can', 'cannot', 'cant', 'con', 'could',
-            'couldnt', 'cry', 'describe', 'detail', 'do', 'done', 'down', 'due',
-            'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else',
-            'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone',
-            'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty',
-            'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly',
-            'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get',
-            'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her',
-            'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers',
-            'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i',
-            'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it',
-            'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least',
-            'less', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill',
-            'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must',
-            'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless',
-            'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not',
-            'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once',
-            'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our',
-            'ours', 'ourselves', 'out', 'over', 'own', 'per', 'perhaps',
-            'please', 'pre', 'put', 'rather', 're', 'same', 'see', 'seem',
-            'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should',
-            'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some',
-            'somehow', 'someone', 'something', 'sometime', 'sometimes',
-            'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the',
-            'their', 'them', 'themselves', 'then', 'thence', 'there',
-            'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these',
-            'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three',
-            'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too',
-            'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under',
-            'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well',
-            'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
-            'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
-            'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
-            'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without',
-            'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves',
-            )
-        self.stop_word_dict={}
-        for word in stop_words: self.stop_word_dict[word]=None
-        self.splitterobj = Splitter.getSplitter()
-        self.init = 1
-
-    def goodword(self, word):
-        return len(word) < 25
-
-    def splitter(self, text, ftype):
-        """never case-sensitive"""
-        if not hasattr(self,'init'): self.initSplitter()
-        return filter(self.goodword, self.splitterobj(text, self.stop_word_dict))
-
-
-#-- "Abstract" parent class for inherited indexers
-#   (does not handle storage in parent, other methods are primitive)
-
-class GenericIndexer:
-    def __init__(self, **kw):
-        apply(self.configure, (), kw)
-
-    def whoami(self):
-        return self.__class__.__name__
+#
+# This module is derived from the module described at:
+#   http://gnosis.cx/publish/programming/charming_python_15.txt
+# 
+# Author: David Mertz (mertz@gnosis.cx)
+# Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
+#            Gregory Popovitch (greg@gpy.com)
+# 
+# The original module was released under this license, and remains under
+# it:
+#
+#     This file is released to the public domain.  I (dqm) would
+#     appreciate it if you choose to keep derived works under terms
+#     that promote freedom, but obviously am giving up any rights
+#     to compel such.
+# 
+#$Id: indexer.py,v 1.10 2002-07-14 23:17:24 richard Exp $
+'''
+This module provides an indexer class, Indexer, that stores text
+indices in a roundup instance.  This class makes searching the content of
+messages, string properties and text files possible.
+'''
+import os, shutil, re, mimetypes, marshal, zlib, errno
+from hyperdb import Link, Multilink
+
+class Indexer:
+    ''' Indexes information from roundup's hyperdb to allow efficient
+        searching.
+
+        Three structures are created by the indexer:
+          files   {identifier: (fileid, wordcount)}
+          words   {word: {fileid: count}}
+          fileids {fileid: identifier}
+        where identifier is (classname, nodeid, propertyname)
+    '''
+    def __init__(self, db_path):
+        self.indexdb_path = os.path.join(db_path, 'indexes')
+        self.indexdb = os.path.join(self.indexdb_path, 'index.db')
+        self.reindex = 0
+        self.quiet = 9
+        self.changed = 0
+
+        # see if we need to reindex because of a change in code
+        version = os.path.join(self.indexdb_path, 'version')
+        if (not os.path.exists(self.indexdb_path) or
+                not os.path.exists(version)):
+            # for now the file itself is a flag
+            self.force_reindex()
+        elif os.path.exists(version):
+            version = open(version).read()
+            # check the value and reindex if it's not the latest
+            if version != '1':
+                self.force_reindex()
+
+    def force_reindex(self):
+        '''Force a reindex condition
+        '''
+        if os.path.exists(self.indexdb_path):
+            shutil.rmtree(self.indexdb_path)
+        os.makedirs(self.indexdb_path)
+        os.chmod(self.indexdb_path, 0775)
+        open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
+        self.reindex = 1
+        self.changed = 1
+
+    def should_reindex(self):
+        '''Should we reindex?
+        '''
+        return self.reindex
+
+    def add_text(self, identifier, text, mime_type='text/plain'):
+        ''' Add some text associated with the (classname, nodeid, property)
+            identifier.
+        '''
+        # make sure the index is loaded
+        self.load_index()
  
-    def configure(self, REINDEX=0, CASESENSITIVE=0,
-                        INDEXDB=os.environ.get('INDEXER_DB', 'TEMP_NDX.DB'),
-                        ADD_PATTERN='*', QUIET=5):
-        "Configure settings used by indexing and storage/retrieval"
-        self.indexdb = INDEXDB
-        self.reindex = REINDEX
-        self.casesensitive = CASESENSITIVE
-        self.add_pattern = ADD_PATTERN
-        self.quiet = QUIET
-        self.filter = None
+        # remove old entries for this identifier
+        if self.files.has_key(identifier):
+            self.purge_entry(identifier)
  
-    def add_files(self, dir=os.getcwd(), pattern=None, descend=1):
-        self.load_index()
-        exclusions = ('.zip','.pyc','.gif','.jpg','.dat','.dir')
-        if not pattern:
-             pattern = self.add_pattern
-        recurse_files(dir, pattern, exclusions, self.add_file)
-        # Rebuild the fileid index
-        self.fileids = {}
-        for fname in self.files.keys():
-            fileid = self.files[fname][0]
-            self.fileids[fileid] = fname
-
-    def add_file(self, fname, ftype='text/plain'):
-        "Index the contents of a regular file"
-        if self.files.has_key(fname):   # Is file eligible for (re)indexing?
-            if self.reindex:            # Reindexing enabled, cleanup dicts
-                self.purge_entry(fname, self.files, self.words)
-            else:                   # DO NOT reindex this file
-                if self.quiet < 5: print "Skipping", fname
-                return 0
-
-        # Read in the file (if possible)
-        try:
-            if fname[-3:] == '.gz':
-                text = gzip.open(fname).read()
-            else:
-                text = open(fname).read()
-            if self.quiet < 5: print "Indexing", fname
-        except IOError:
-            return 0
-        words = self.splitter(text, ftype)
+        # split into words
+        words = self.splitter(text, mime_type)
  
-        # Find new file index, and assign it to filename
+        # Find new file index, and assign it to identifier
          # (_TOP uses trick of negative to avoid conflict with file index)
          self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
-        file_index =  abs(self.files['_TOP'][0])
-        self.files[fname] = (file_index, len(words))
+        file_index = abs(self.files['_TOP'][0])
+        self.files[identifier] = (file_index, len(words))
+        self.fileids[file_index] = identifier
  
+        # find the unique words
          filedict = {}
          for word in words:
              if filedict.has_key(word):
@@ -400,391 +97,305 @@ class GenericIndexer:
              else:
                  filedict[word] = 1
  
+        # now add to the totals
          for word in filedict.keys():
+            # each word has a dict of {fileid: count}
              if self.words.has_key(word):
                  entry = self.words[word]
              else:
+                # new word
                  entry = {}
+                self.words[word] = entry
+
+            # make a reference to the file for this word
              entry[file_index] = filedict[word]
-            self.words[word] = entry
  
-    def add_othertext(self, identifier):
-        """Index a textual source other than a plain file
+        # save needed
+        self.changed = 1
  
-        A child class might want to implement this method (or a similar one)
-        in order to index textual sources such as SQL tables, URLs, clay
-        tablets, or whatever else.  The identifier should uniquely pick out
-        the source of the text (whatever it is)
-        """
-        raise NotImplementedError
+    def splitter(self, text, ftype):
+        ''' Split the contents of a text string into a list of 'words'
+        '''
+        if ftype == 'text/plain':
+            words = self.text_splitter(text)
+        else:
+            return []
+        return words
  
-    def save_index(self, INDEXDB=None):
-        raise NotImplementedError
+    def text_splitter(self, text):
+        """Split text/plain string into a list of words
+        """
+        # case insensitive
+        text = text.upper()
+
+        # Split the raw text into words of 2 to 25 word characters; longer
+        # runs are dropped since they'll be gibberish (encoded text or
+        # somesuch) and single characters are dropped since they appear
+        # all over the place
+        return re.findall(r'\b\w{2,25}\b', text)
+
+    def search(self, search_terms, klass, ignore={},
+            dre=re.compile(r'([^\d]+)(\d+)')):
+        ''' Return search results for [search, terms] associated with the
+            hyperdb Class "klass", as {nodeid: {linkprop: [nodeids]}}.
+            Hits whose (classname, property) pair is a key of "ignore" are
+            skipped.  "dre" is a helper, not an argument.
+        '''
+        # do the index lookup
+        hits = self.find(search_terms)
+        if not hits:
+            return {}
+
+        #designator_propname = {'msg': 'messages', 'file': 'files'}
+        designator_propname = {}
+        for nm, propclass in klass.getprops().items():
+            if isinstance(propclass, Link) or isinstance(propclass, Multilink):
+                designator_propname[propclass.classname] = nm
+
+        # build a dictionary of nodes and their associated messages
+        # and files
+        nodeids = {}    # this is the answer
+        propspec = {}     # used to do the klass.find
+        for propname in designator_propname.values():
+            propspec[propname] = {}   # used as a set (value doesn't matter)
+        for classname, nodeid, property in hits.values():
+            # skip this result if we don't care about this class/property
+            if ignore.has_key((classname, property)):
+                continue
  
-    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
-        raise NotImplementedError
+            # if it's a property on klass, it's easy
+            if classname == klass.classname:
+                if not nodeids.has_key(nodeid):
+                    nodeids[nodeid] = {}
+                continue
  
-    def find(self, wordlist, print_report=0):
-        "Locate files that match ALL the words in wordlist"
+            # it's a linked class - set up to do the klass.find
+            linkprop = designator_propname[classname]   # eg, msg -> messages
+            propspec[linkprop][nodeid] = 1
+
+        # retain only the meaningful entries
+        for propname, idset in propspec.items():
+            if not idset:
+                del propspec[propname]
+        
+        # klass.find tells me the klass nodeids the linked nodes relate to
+        for resid in klass.find(**propspec):
+            resid = str(resid)
+            if not nodeids.has_key(id):
+                nodeids[resid] = {}
+            node_dict = nodeids[resid]
+            # now figure out where it came from
+            for linkprop in propspec.keys():
+                for nodeid in klass.get(resid, linkprop):
+                    if propspec[linkprop].has_key(nodeid):
+                        # OK, this node[propname] has a winner
+                        if not node_dict.has_key(linkprop):
+                            node_dict[linkprop] = [nodeid]
+                        else:
+                            node_dict[linkprop].append(nodeid)
+        return nodeids
+
+    # we override this to ignore words whose length is not 2 < len < 25, and
+    # also to fix a bug in the (fail) case - it now returns {} instead of 0.
+    def find(self, wordlist):
+        ''' Locate files that match ALL the words in wordlist
+        '''
+        if not hasattr(self, 'words'):
+            self.load_index()
          self.load_index(wordlist=wordlist)
          entries = {}
-        hits = copy.copy(self.fileids)      # Copy of fileids index
+        hits = None
          for word in wordlist:
-            if not self.casesensitive:
-                word = string.upper(word)
+            if not 2 < len(word) < 25:
+                # word outside the bounds of what we index - ignore
+                continue
+            word = word.upper()
              entry = self.words.get(word)    # For each word, get index
              entries[word] = entry           #   of matching files
              if not entry:                   # Nothing for this one word (fail)
-                return 0
-            for fileid in hits.keys():      # Eliminate hits for every non-match
-                if not entry.has_key(fileid):
-                    del hits[fileid]
-        if print_report:
-            self.print_report(hits, wordlist, entries)
-        return hits
-
-    def print_report(self, hits={}, wordlist=[], entries={}):
-        # Figure out what to actually print (based on QUIET level)
-        output = []
-        for fileid,fname in hits.items():
-            message = fname
-            if self.quiet <= 3:
-                wordcount = self.files[fname][1]
-                matches = 0
-                countmess = '\n'+' '*13+`wordcount`+' words; '
-                for word in wordlist:
-                    if not self.casesensitive:
-                        word = string.upper(word)
-                    occurs = entries[word][fileid]
-                    matches = matches+occurs
-                    countmess = countmess +`occurs`+' '+word+'; '
-                message = string.ljust('[RATING: '
-                                       +`1000*matches/wordcount`+']',13)+message
-                if self.quiet <= 2: message = message +countmess +'\n'
-            if self.filter:     # Using an output filter
-                if fnmatch.fnmatch(message, self.filter):
-                    output.append(message)
+                return {}
+            if hits is None:
+                hits = {}
+                for k in entry.keys():
+                    hits[k] = self.fileids[k]
              else:
-                output.append(message)
-
-        if self.quiet <= 5:
-            print string.join(output,'\n')
-        sys.stderr.write('\n'+`len(output)`+' files matched wordlist: '+
-                         `wordlist`+'\n')
-        return output
-
-    def purge_entry(self, fname, file_dct, word_dct):
-        "Remove a file from file index and word index"
-        try:        # The easy part, cleanup the file index
-            file_index = file_dct[fname]
-            del file_dct[fname]
-        except KeyError:
-            pass    # We'll assume we only encounter KeyError's
-        # The much harder part, cleanup the word index
-        for word, occurs in word_dct.items():
-            if occurs.has_key(file_index):
-                del occurs[file_index]
-                word_dct[word] = occurs
+                # Eliminate hits for every non-match
+                for fileid in hits.keys():
+                    if not entry.has_key(fileid):
+                        del hits[fileid]
+        if hits is None:
+            return {}
+        return hits
  
-    def index_loaded(self):
-        return ( hasattr(self,'fileids') and
-                 hasattr(self,'files')   and
-                 hasattr(self,'words')      )
-
-#-- Provide an actual storage facility for the indexes (i.e. shelve)
-class ShelveIndexer(GenericIndexer, TextSplitter):
-    """Concrete Indexer utilizing [shelve] for storage
-
-    Unfortunately, [shelve] proves far too slow in indexing, while
-    creating monstrously large indexes.  Not recommend, at least under
-    the default dbm's tested.  Also, class may be broken because
-    shelves do not, apparently, support the .values() and .items()
-    methods.  Fixing this is a low priority, but the sample code is
-    left here.
-    """
-    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
-        INDEXDB = INDEXDB or self.indexdb
-        import shelve
-        self.words   = shelve.open(INDEXDB+".WORDS")
-        self.files   = shelve.open(INDEXDB+".FILES")
-        self.fileids = shelve.open(INDEXDB+".FILEIDS")
-        if not FILES:            # New index
-            self.files['_TOP'] = (0,None)
-
-    def save_index(self, INDEXDB=None):
-        INDEXDB = INDEXDB or self.indexdb
-        pass
-
-class FlatIndexer(GenericIndexer, TextSplitter):
-    """Concrete Indexer utilizing flat-file for storage
-
-    See the comments in the referenced article for details; in
-    brief, this indexer has about the same timing as the best in
-    -creating- indexes and the storage requirements are
-    reasonable.  However, actually -using- a flat-file index is
-    more than an order of magnitude worse than the best indexer
-    (ZPickleIndexer wins overall).
-
-    On the other hand, FlatIndexer creates a wonderfully easy to
-    parse database format if you have a reason to transport the
-    index to a different platform or programming language.  And
-    should you perform indexing as part of a long-running
-    process, the overhead of initial file parsing becomes
-    irrelevant.
-    """
-    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
-        # Unless reload is indicated, do not load twice
-        if self.index_loaded() and not reload: return 0
-        # Ok, now let's actually load it
-        INDEXDB = INDEXDB or self.indexdb
-        self.words = {}
-        self.files = {'_TOP':(0,None)}
-        self.fileids = {}
-        try:                            # Read index contents
-            for line in open(INDEXDB).readlines():
-                fields = string.split(line)
-                if fields[0] == '-':    # Read a file/fileid line
-                    fileid = eval(fields[2])
-                    wordcount = eval(fields[3])
-                    fname = fields[1]
-                    self.files[fname] = (fileid, wordcount)
-                    self.fileids[fileid] = fname
-                else:                   # Read a word entry (dict of hits)
-                    entries = {}
-                    word = fields[0]
-                    for n in range(1,len(fields),2):
-                        fileid = eval(fields[n])
-                        occurs = eval(fields[n+1])
-                        entries[fileid] = occurs
-                    self.words[word] = entries
-        except:
-            pass                    # New index
-
-    def save_index(self, INDEXDB=None):
-        INDEXDB = INDEXDB or self.indexdb
-        tab, lf, sp = '\t','\n',' '
-        indexdb = open(INDEXDB,'w')
-        for fname,entry in self.files.items():
-            indexdb.write('- '+fname +tab +`entry[0]` +tab +`entry[1]` +lf)
-        for word,entry in self.words.items():
-            indexdb.write(word +tab+tab)
-            for fileid,occurs in entry.items():
-                indexdb.write(`fileid` +sp +`occurs` +sp)
-            indexdb.write(lf)
-
-class PickleIndexer(GenericIndexer, TextSplitter):
-    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
-        # Unless reload is indicated, do not load twice
-        if self.index_loaded() and not reload: return 0
-        # Ok, now let's actually load it
-        import cPickle
-        INDEXDB = INDEXDB or self.indexdb
-        try:
-            pickle_str =  open(INDEXDB,'rb').read()
-            db = cPickle.loads(pickle_str)
-        except:                     # New index
-            db = Index({}, {'_TOP':(0,None)}, {})
-        self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS
-
-    def save_index(self, INDEXDB=None):
-        import cPickle
-        INDEXDB = INDEXDB or self.indexdb
-        db = Index(self.words, self.files, self.fileids)
-        open(INDEXDB,'wb').write(cPickle.dumps(db, 1))
-
-class XMLPickleIndexer(PickleIndexer):
-    """Concrete Indexer utilizing XML for storage
-
-    While this is, as expected, a verbose format, the possibility
-    of using XML as a transport format for indexes might be
-    useful.  However, [xml_pickle] is in need of some redesign to
-    avoid gross inefficiency when creating very large
-    (multi-megabyte) output files (fixed in [xml_pickle] version
-    0.48 or above)
-    """
-    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
-        # Unless reload is indicated, do not load twice
-        if self.index_loaded() and not reload: return 0
-        # Ok, now let's actually load it
-        from gnosis.xml.pickle import XML_Pickler
-        INDEXDB = INDEXDB or self.indexdb
-        try:                        # XML file exists
-            xml_str = open(INDEXDB).read()
-            db = XML_Pickler().loads(xml_str)
-        except:                     # New index
-            db = Index({}, {'_TOP':(0,None)}, {})
-        self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS
-
-    def save_index(self, INDEXDB=None):
-        from gnosis.xml.pickle import XML_Pickler
-        INDEXDB = INDEXDB or self.indexdb
-        db = Index(self.words, self.files, self.fileids)
-        open(INDEXDB,'w').write(XML_Pickler(db).dumps())
-
-class ZPickleIndexer(PickleIndexer):
-    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
-        # Unless reload is indicated, do not load twice
-        if self.index_loaded() and not reload: return 0
-        # Ok, now let's actually load it
-        import cPickle, zlib
-        INDEXDB = INDEXDB or self.indexdb
-        try:
-            pickle_str =  zlib.decompress(open(INDEXDB+'!','rb').read())
-            db = cPickle.loads(pickle_str)
-        except:                     # New index
-            db = Index({}, {'_TOP':(0,None)}, {})
-        self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS
-
-    def save_index(self, INDEXDB=None):
-        import cPickle, zlib
-        INDEXDB = INDEXDB or self.indexdb
-        db = Index(self.words, self.files, self.fileids)
-        pickle_fh = open(INDEXDB+'!','wb')
-        pickle_fh.write(zlib.compress(cPickle.dumps(db, 1)))
-
-
-class SlicedZPickleIndexer(ZPickleIndexer):
-    segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!"
-    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
+    segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
+    def load_index(self, reload=0, wordlist=None):
          # Unless reload is indicated, do not load twice
-        if self.index_loaded() and not reload: return 0
+        if self.index_loaded() and not reload:
+            return 0
+
          # Ok, now let's actually load it
-        import cPickle, zlib
-        INDEXDB = INDEXDB or self.indexdb
-        db = Index({}, {'_TOP':(0,None)}, {})
+        db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}}
+
          # Identify the relevant word-dictionary segments
          if not wordlist:
              segments = self.segments
          else:
              segments = ['-','#']
              for word in wordlist:
-                segments.append(string.upper(word[0]))
+                segments.append(word[0].upper())
+
          # Load the segments
          for segment in segments:
              try:
-                pickle_str = zlib.decompress(open(INDEXDB+segment,'rb').read())
-                dbslice = cPickle.loads(pickle_str)
-                if dbslice.__dict__.get('WORDS'):   # If it has some words, add them
-                    for word,entry in dbslice.WORDS.items():
-                        db.WORDS[word] = entry
-                if dbslice.__dict__.get('FILES'):   # If it has some files, add them
-                    db.FILES = dbslice.FILES
-                if dbslice.__dict__.get('FILEIDS'): # If it has fileids, add them
-                    db.FILEIDS = dbslice.FILEIDS
-            except:
-                pass    # No biggie, couldn't find this segment
-        self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS
-
-    def julienne(self, INDEXDB=None):
-        import cPickle, zlib
-        INDEXDB = INDEXDB or self.indexdb
-        segments = self.segments       # all the (little) indexes
-        for segment in segments:
-            try:        # brutal space saver... delete all the small segments
-                os.remove(INDEXDB+segment)
-            except OSError:
-                pass    # probably just nonexistent segment index file
+                f = open(self.indexdb + segment, 'rb')
+            except IOError, error:
+                # probably just nonexistent segment index file
+                if error.errno != errno.ENOENT: raise
+            else:
+                pickle_str = zlib.decompress(f.read())
+                f.close()
+                dbslice = marshal.loads(pickle_str)
+                if dbslice.get('WORDS'):
+                    # if it has some words, add them
+                    for word, entry in dbslice['WORDS'].items():
+                        db['WORDS'][word] = entry
+                if dbslice.get('FILES'):
+                    # if it has some files, add them
+                    db['FILES'] = dbslice['FILES']
+                if dbslice.get('FILEIDS'):
+                    # if it has fileids, add them
+                    db['FILEIDS'] = dbslice['FILEIDS']
+
+        self.words = db['WORDS']
+        self.files = db['FILES']
+        self.fileids = db['FILEIDS']
+        self.changed = 0
+
+    def save_index(self):
+        # only save if the index is loaded and changed
+        if not self.index_loaded() or not self.changed:
+            return
+
+        # brutal space saver... delete all the small segments
+        for segment in self.segments:
+            try:
+                os.remove(self.indexdb + segment)
+            except OSError, error:
+                # probably just nonexistent segment index file
+                if error.errno != errno.ENOENT: raise
+
          # First write the much simpler filename/fileid dictionaries
-        dbfil = Index(None, self.files, self.fileids)
-        open(INDEXDB+'-','wb').write(zlib.compress(cPickle.dumps(dbfil,1)))
+        dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
+        open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))
+
          # The hard part is splitting the word dictionary up, of course
-        letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+        letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
          segdicts = {}                           # Need batch of empty dicts
-        for segment in letters+'#':
+        for segment in letters:
              segdicts[segment] = {}
          for word, entry in self.words.items():  # Split into segment dicts
-            initchar = string.upper(word[0])
-            if initchar in letters:
-                segdicts[initchar][word] = entry
-            else:
-                segdicts['#'][word] = entry
-        for initchar in letters+'#':
-            db = Index(segdicts[initchar], None, None)
-            pickle_str = cPickle.dumps(db, 1)
-            filename = INDEXDB+initchar
-            pickle_fh = open(filename,'wb')
+            initchar = word[0].upper()
+            segdicts[initchar][word] = entry
+
+        # save
+        for initchar in letters:
+            db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None}
+            pickle_str = marshal.dumps(db)
+            filename = self.indexdb + initchar
+            pickle_fh = open(filename, 'wb')
              pickle_fh.write(zlib.compress(pickle_str))
-            os.chmod(filename,0664)
-
-    save_index = julienne
-
-PreferredIndexer = SlicedZPickleIndexer
-
-#-- If called from command-line, parse arguments and take actions
-if __name__ == '__main__':
-    import time
-    start = time.time()
-    search_words = []           # Word search list (if specified)
-    opts = 0                    # Any options specified?
-    if len(sys.argv) < 2:
-        pass                    # No options given
-    else:
-        upper = string.upper
-        dir = os.getcwd()       # Default to indexing from current directory
-        descend = 1             # Default to recursive indexing
-        ndx = PreferredIndexer()
-        for opt in sys.argv[1:]:
-            if opt in ('-h','/h','-?','/?','?','--help'):   # help screen
-                print __shell_usage__
-                opts = -1
-                break
-            elif opt[0] in '/-':                            # a switch!
-                opts = opts+1
-                if upper(opt[1:]) == 'INDEX':               # Index files
-                    ndx.quiet = 0
-                    pass     # Use defaults if no other options
-                elif upper(opt[1:]) == 'REINDEX':           # Reindex
-                    ndx.reindex = 1
-                elif upper(opt[1:]) == 'CASESENSITIVE':     # Case sensitive
-                    ndx.casesensitive = 1
-                elif upper(opt[1:]) in ('NORECURSE','LOCAL'): # No recursion
-                    descend = 0
-                elif upper(opt[1:4]) == 'DIR':              # Dir to index
-                    dir = opt[5:]
-                elif upper(opt[1:8]) == 'INDEXDB':          # Index specified
-                    ndx.indexdb = opt[9:]
-                    sys.stderr.write(
-                        "Use of INDEXER_DB environment variable is STRONGLY recommended.\n")
-                elif upper(opt[1:6]) == 'REGEX':            # RegEx files to index
-                    ndx.add_pattern = re.compile(opt[7:])
-                elif upper(opt[1:5]) == 'GLOB':             # Glob files to index
-                    ndx.add_pattern = opt[6:]
-                elif upper(opt[1:7]) in ('OUTPUT','FORMAT'): # How should results look?
-                    opts = opts-1   # this is not an option for indexing purposes
-                    level = upper(opt[8:])
-                    if level in ('ALL','EVERYTHING','VERBOSE', 'MAX'):
-                        ndx.quiet = 0
-                    elif level in ('RATINGS','SCORES','HIGH'):
-                        ndx.quiet = 3
-                    elif level in ('FILENAMES','NAMES','FILES','MID'):
-                        ndx.quiet = 5
-                    elif level in ('SUMMARY','MIN'):
-                        ndx.quiet = 9
-                elif upper(opt[1:7]) == 'FILTER':           # Regex filter output
-                    opts = opts-1   # this is not an option for indexing purposes
-                    ndx.filter = opt[8:]
-                elif opt[1:] in string.digits:
-                    opts = opts-1
-                    ndx.quiet = eval(opt[1])
-            else:
-                search_words.append(opt)                    # Search words
-
-    if opts > 0:
-        ndx.add_files(dir=dir)
-        ndx.save_index()
-    if search_words:
-        ndx.find(search_words, print_report=1)
-    if not opts and not search_words:
-        sys.stderr.write("Perhaps you would like to use the --help option?\n")
-    else:
-        sys.stderr.write('Processed in %.3f seconds (%s)'
-                          % (time.time()-start, ndx.whoami()))
+            os.chmod(filename, 0664)
+
+        # save done
+        self.changed = 0
+
+    def purge_entry(self, identifier):
+        ''' Remove a file from file index and word index
+        '''
+        if not self.files.has_key(identifier):
+            return
+
+        file_index = self.files[identifier][0]
+        del self.files[identifier]
+        del self.fileids[file_index]
+
+        # The much harder part, cleanup the word index
+        for key, occurs in self.words.items():
+            if occurs.has_key(file_index):
+                del occurs[file_index]
+
+        # save needed
+        self.changed = 1
+
+    def index_loaded(self):
+        return (hasattr(self,'fileids') and hasattr(self,'files') and
+            hasattr(self,'words'))
  
  #
  #$Log: not supported by cvs2svn $
-#Revision 1.1.2.3  2002/04/03 12:05:15  rochecompaan
-#Removed dos control characters.
+#Revision 1.9  2002/07/14 06:11:16  richard
+#Some TODOs
+#
+#Revision 1.8  2002/07/09 21:53:38  gmcm
+#Optimize Class.find so that the propspec can contain a set of ids to match.
+#This is used by indexer.search so it can do just one find for all the index matches.
+#This was already confusing code, but for common terms (lots of index matches),
+#it is enormously faster.
+#
+#Revision 1.7  2002/07/09 21:38:43  richard
+#Only save the index if the thing is loaded and changed. Also, don't load
+#the index just for a save.
+#
+#Revision 1.6  2002/07/09 04:26:44  richard
+#We're indexing numbers now, and _underscore words
+#
+#Revision 1.5  2002/07/09 04:19:09  richard
+#Added reindex command to roundup-admin.
+#Fixed reindex on first access.
+#Also fixed reindexing of entries that change.
+#
+#Revision 1.4  2002/07/09 03:02:52  richard
+#More indexer work:
+#- all String properties may now be indexed too. Currently there's a bit of
+#  "issue" specific code in the actual searching which needs to be
+#  addressed. In a nutshell:
+#  + pass 'indexme="yes"' as a String() property initialisation arg, eg:
+#        file = FileClass(db, "file", name=String(), type=String(),
+#            comment=String(indexme="yes"))
+#  + the comment will then be indexed and be searchable, with the results
+#    related back to the issue that the file is linked to
+#- as a result of this work, the FileClass has a default MIME type that may
+#  be overridden in a subclass, or by the use of a "type" property as is
+#  done in the default templates.
+#- the regeneration of the indexes (if necessary) is done once the schema is
+#  set up in the dbinit.
+#
+#Revision 1.3  2002/07/08 06:58:15  richard
+#cleaned up the indexer code:
+# - it splits more words out (much simpler, faster splitter)
+# - removed code we'll never use (roundup.roundup_indexer has the full
+#   implementation, and replaces roundup.indexer)
+# - only index text/plain and rfc822/message (ideas for other text formats to
+#   index are welcome)
+# - added simple unit test for indexer. Needs more tests for regression.
+#
+#Revision 1.2  2002/05/25 07:16:24  rochecompaan
+#Merged search_indexing-branch with HEAD
+#
+#Revision 1.1.2.3  2002/05/02 11:52:12  rochecompaan
+#Fixed small bug that prevented indexes from being generated.
+#
+#Revision 1.1.2.2  2002/04/19 19:54:42  rochecompaan
+#cgi_client.py
+#    removed search link for the time being
+#    moved rendering of matches to htmltemplate
+#hyperdb.py
+#    filtering of nodes on full text search incorporated in filter method
+#roundupdb.py
+#    added paramater to call of filter method
+#roundup_indexer.py
+#    added search method to RoundupIndexer class
  #
-#Revision 1.1.2.2  2002/04/03 12:01:55  rochecompaan
-#Oops. Forgot to include cvs keywords in file.
+#Revision 1.1.2.1  2002/04/03 11:55:57  rochecompaan
+# . Added feature #526730 - search for messages capability
  #