Forward-porting of fixes from the maintenance branch.

[roundup.git] / roundup / indexer.py
diff --git a/roundup/indexer.py b/roundup/indexer.py

index 35e5a2990ba0f6da40b5171fcb977d4bc3f9ecfa..4cd5d24ad1b763e5571f1d4bc25233a22715594d 100644 (file)
--- a/roundup/indexer.py
+++ b/roundup/indexer.py
@@ -14,24 +14,27 @@
  #     that promote freedom, but obviously am giving up any rights
  #     to compel such.
  # 
-#$Id: indexer.py,v 1.11 2002-07-18 11:17:30 gmcm Exp $
-'''
-This module provides an indexer class, RoundupIndexer, that stores text
+#$Id: indexer.py,v 1.18 2004-02-11 23:55:08 richard Exp $
+'''This module provides an indexer class, RoundupIndexer, that stores text
  indices in a roundup instance.  This class makes searching the content of
  messages, string properties and text files possible.
  '''
+__docformat__ = 'restructuredtext'
+
  import os, shutil, re, mimetypes, marshal, zlib, errno
  from hyperdb import Link, Multilink
  
  class Indexer:
-    ''' Indexes information from roundup's hyperdb to allow efficient
-        searching.
+    '''Indexes information from roundup's hyperdb to allow efficient
+    searching.
+
+    Three structures are created by the indexer::
  
-        Three structures are created by the indexer:
            files   {identifier: (fileid, wordcount)}
            words   {word: {fileid: count}}
            fileids {fileid: identifier}
-        where identifier is (classname, nodeid, propertyname)
+
+    where identifier is (classname, nodeid, propertyname)
      '''
      def __init__(self, db_path):
          self.indexdb_path = os.path.join(db_path, 'indexes')
@@ -69,8 +72,8 @@ class Indexer:
          return self.reindex
  
      def add_text(self, identifier, text, mime_type='text/plain'):
-        ''' Add some text associated with the (classname, nodeid, property)
-            identifier.
+        '''Add some text associated with the (classname, nodeid, property)
+        identifier.
          '''
          # make sure the index is loaded
          self.load_index()
@@ -114,7 +117,7 @@ class Indexer:
          self.changed = 1
  
      def splitter(self, text, ftype):
-        ''' Split the contents of a text string into a list of 'words'
+        '''Split the contents of a text string into a list of 'words'
          '''
          if ftype == 'text/plain':
              words = self.text_splitter(text)
@@ -136,17 +139,16 @@ class Indexer:
  
      def search(self, search_terms, klass, ignore={},
              dre=re.compile(r'([^\d]+)(\d+)')):
-        ''' Display search results looking for [search, terms] associated
-            with the hyperdb Class "klass". Ignore hits on {class: property}.
+        '''Display search results looking for [search, terms] associated
+        with the hyperdb Class "klass". Ignore hits on {class: property}.
  
-            "dre" is a helper, not an argument.
+        "dre" is a helper, not an argument.
          '''
          # do the index lookup
          hits = self.find(search_terms)
          if not hits:
              return {}
  
-        #designator_propname = {'msg': 'messages', 'file': 'files'}
          designator_propname = {}
          for nm, propclass in klass.getprops().items():
              if isinstance(propclass, Link) or isinstance(propclass, Multilink):
@@ -154,7 +156,7 @@ class Indexer:
  
          # build a dictionary of nodes and their associated messages
          # and files
-        nodeids = {}    # this is the answer
+        nodeids = {}      # this is the answer
          propspec = {}     # used to do the klass.find
          for propname in designator_propname.values():
              propspec[propname] = {}   # used as a set (value doesn't matter)
@@ -169,6 +171,10 @@ class Indexer:
                      nodeids[nodeid] = {}
                  continue
  
+            # make sure the class is a linked one, otherwise ignore
+            if not designator_propname.has_key(classname):
+                continue
+
              # it's a linked class - set up to do the klass.find
              linkprop = designator_propname[classname]   # eg, msg -> messages
              propspec[linkprop][nodeid] = 1
@@ -198,7 +204,7 @@ class Indexer:
      # we override this to ignore not 2 < word < 25 and also to fix a bug -
      # the (fail) case.
      def find(self, wordlist):
-        ''' Locate files that match ALL the words in wordlist
+        '''Locate files that match ALL the words in wordlist
          '''
          if not hasattr(self, 'words'):
              self.load_index()
@@ -217,6 +223,8 @@ class Indexer:
              if hits is None:
                  hits = {}
                  for k in entry.keys():
+                    if not self.fileids.has_key(k):
+                        raise ValueError, 'Index is corrupted: re-generate it'
                      hits[k] = self.fileids[k]
              else:
                  # Eliminate hits for every non-match
@@ -310,8 +318,10 @@ class Indexer:
          self.changed = 0
  
      def purge_entry(self, identifier):
-        ''' Remove a file from file index and word index
+        '''Remove a file from file index and word index
          '''
+        self.load_index()
+
          if not self.files.has_key(identifier):
              return
  
@@ -331,74 +341,4 @@ class Indexer:
          return (hasattr(self,'fileids') and hasattr(self,'files') and
              hasattr(self,'words'))
  
-#
-#$Log: not supported by cvs2svn $
-#Revision 1.10  2002/07/14 23:17:24  richard
-#oops
-#
-#Revision 1.9  2002/07/14 06:11:16  richard
-#Some TODOs
-#
-#Revision 1.8  2002/07/09 21:53:38  gmcm
-#Optimize Class.find so that the propspec can contain a set of ids to match.
-#This is used by indexer.search so it can do just one find for all the index matches.
-#This was already confusing code, but for common terms (lots of index matches),
-#it is enormously faster.
-#
-#Revision 1.7  2002/07/09 21:38:43  richard
-#Only save the index if the thing is loaded and changed. Also, don't load
-#the index just for a save.
-#
-#Revision 1.6  2002/07/09 04:26:44  richard
-#We're indexing numbers now, and _underscore words
-#
-#Revision 1.5  2002/07/09 04:19:09  richard
-#Added reindex command to roundup-admin.
-#Fixed reindex on first access.
-#Also fixed reindexing of entries that change.
-#
-#Revision 1.4  2002/07/09 03:02:52  richard
-#More indexer work:
-#- all String properties may now be indexed too. Currently there's a bit of
-#  "issue" specific code in the actual searching which needs to be
-#  addressed. In a nutshell:
-#  + pass 'indexme="yes"' as a String() property initialisation arg, eg:
-#        file = FileClass(db, "file", name=String(), type=String(),
-#            comment=String(indexme="yes"))
-#  + the comment will then be indexed and be searchable, with the results
-#    related back to the issue that the file is linked to
-#- as a result of this work, the FileClass has a default MIME type that may
-#  be overridden in a subclass, or by the use of a "type" property as is
-#  done in the default templates.
-#- the regeneration of the indexes (if necessary) is done once the schema is
-#  set up in the dbinit.
-#
-#Revision 1.3  2002/07/08 06:58:15  richard
-#cleaned up the indexer code:
-# - it splits more words out (much simpler, faster splitter)
-# - removed code we'll never use (roundup.roundup_indexer has the full
-#   implementation, and replaces roundup.indexer)
-# - only index text/plain and rfc822/message (ideas for other text formats to
-#   index are welcome)
-# - added simple unit test for indexer. Needs more tests for regression.
-#
-#Revision 1.2  2002/05/25 07:16:24  rochecompaan
-#Merged search_indexing-branch with HEAD
-#
-#Revision 1.1.2.3  2002/05/02 11:52:12  rochecompaan
-#Fixed small bug that prevented indexes from being generated.
-#
-#Revision 1.1.2.2  2002/04/19 19:54:42  rochecompaan
-#cgi_client.py
-#    removed search link for the time being
-#    moved rendering of matches to htmltemplate
-#hyperdb.py
-#    filtering of nodes on full text search incorporated in filter method
-#roundupdb.py
-#    added paramater to call of filter method
-#roundup_indexer.py
-#    added search method to RoundupIndexer class
-#
-#Revision 1.1.2.1  2002/04/03 11:55:57  rochecompaan
-# . Added feature #526730 - search for messages capability
-#
+# vim: set filetype=python ts=4 sw=4 et si