diff --git a/roundup/indexer.py b/roundup/indexer.py
index 8b2f61562dda3634755f7e1784fc3886c8fb5cdf..74bfcafb9e3176ed17e632e75866354b82d18049 100644 (file)
--- a/roundup/indexer.py
+++ b/roundup/indexer.py
# that promote freedom, but obviously am giving up any rights
# to compel such.
#
-#$Id: indexer.py,v 1.3 2002-07-08 06:58:15 richard Exp $
+#$Id: indexer.py,v 1.12 2002-07-19 03:36:33 richard Exp $
'''
This module provides an indexer class, RoundupIndexer, that stores text
indices in a roundup instance. This class makes searching the content of
-messages and text files possible.
+messages, string properties and text files possible.
'''
import os, shutil, re, mimetypes, marshal, zlib, errno
+from hyperdb import Link, Multilink
class Indexer:
- ''' Indexes messages and files.
-
- This implements a new splitter based on re.findall '\w+' and the
- add_othertext method.
+ ''' Indexes information from roundup's hyperdb to allow efficient
+ searching.
+
+ Three structures are created by the indexer:
+ files {identifier: (fileid, wordcount)}
+ words {word: {fileid: count}}
+ fileids {fileid: identifier}
+ where identifier is (classname, nodeid, propertyname)
'''
def __init__(self, db_path):
- indexdb_path = os.path.join(db_path, 'indexes')
-
- # see if we need to reindex because of a change in code
- if (os.path.exists(indexdb_path) and
- not os.path.exists(os.path.join(indexdb_path, 'version'))):
- shutil.rmtree(indexdb_path)
-
- # see if the index exists
- index_exists = 0
- if not os.path.exists(indexdb_path):
- os.makedirs(indexdb_path)
- os.chmod(indexdb_path, 0775)
- open(os.path.join(indexdb_path, 'version'), 'w').write('1\n')
- else:
- index_exists = 1
-
- # save off the path to the indexdb
- self.indexdb = os.path.join(indexdb_path, 'index.db')
+ self.indexdb_path = os.path.join(db_path, 'indexes')
+ self.indexdb = os.path.join(self.indexdb_path, 'index.db')
self.reindex = 0
- self.casesensitive = 0
self.quiet = 9
+ self.changed = 0
- if not index_exists:
- # index everything
- files_path = os.path.join(db_path, 'files')
- self.add_files(dir=files_path)
- self.save_index()
-
- # override add_files so it's a little smarter about file types
- def add_files(self, dir):
- if not hasattr(self, 'files'):
- self.load_index()
- os.path.walk(dir, self.walk_add_file, None)
- # Rebuild the fileid index
- self.fileids = {}
- for fname in self.files.keys():
- fileid = self.files[fname][0]
- self.fileids[fileid] = fname
-
- # override add_file so it can be a little smarter about determining the
- # file type
- def walk_add_file(self, arg, dname, names, ftype=None):
- for name in names:
- name = os.path.join(dname, name)
- if os.path.isfile(name):
- self.add_file(name)
- elif os.path.isdir(name):
- os.path.walk(name, self.walk_add_file, None)
- def add_file(self, fname, ftype=None):
- ''' Index the contents of a regular file
+ # see if we need to reindex because of a change in code
+ version = os.path.join(self.indexdb_path, 'version')
+ if (not os.path.exists(self.indexdb_path) or
+ not os.path.exists(version)):
+ # for now the file itself is a flag
+ self.force_reindex()
+ elif os.path.exists(version):
+ version = open(version).read()
+ # check the value and reindex if it's not the latest
+ if version.strip() != '1':
+ self.force_reindex()
+
+ def force_reindex(self):
+ '''Force a reindex condition
'''
- if not hasattr(self, 'files'):
- self.load_index()
- # Is file eligible for (re)indexing?
- if self.files.has_key(fname):
- if self.reindex:
- # Reindexing enabled, cleanup dicts
- self.purge_entry(fname, self.files, self.words)
- else:
- # DO NOT reindex this file
- if self.quiet < 5:
- print "Skipping", fname
- return 0
-
- # guess the file type
- if ftype is None:
- ftype = mimetypes.guess_type(fname)
-
- # read in the file
- text = open(fname).read()
- if self.quiet < 5: print "Indexing", fname
- words = self.splitter(text, ftype)
-
- # Find new file index, and assign it to filename
- # (_TOP uses trick of negative to avoid conflict with file index)
- self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
- file_index = abs(self.files['_TOP'][0])
- self.files[fname] = (file_index, len(words))
-
- filedict = {}
- for word in words:
- if filedict.has_key(word):
- filedict[word] = filedict[word]+1
- else:
- filedict[word] = 1
-
- for word in filedict.keys():
- if self.words.has_key(word):
- entry = self.words[word]
- else:
- entry = {}
- entry[file_index] = filedict[word]
- self.words[word] = entry
+ if os.path.exists(self.indexdb_path):
+ shutil.rmtree(self.indexdb_path)
+ os.makedirs(self.indexdb_path)
+ os.chmod(self.indexdb_path, 0775)
+ open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
+ self.reindex = 1
+ self.changed = 1
+
+ def should_reindex(self):
+ '''Should we reindex?
+ '''
+ return self.reindex
- # NOTE: this method signature deviates from the one specified in
- # indexer - I'm not entirely sure where it was expected to the text
- # from otherwise...
- def add_othertext(self, identifier, text):
- ''' Add some text associated with the identifier
+ def add_text(self, identifier, text, mime_type='text/plain'):
+ ''' Add some text associated with the (classname, nodeid, property)
+ identifier.
'''
- # Is file eligible for (re)indexing?
+ # make sure the index is loaded
+ self.load_index()
+
+ # remove old entries for this identifier
if self.files.has_key(identifier):
- # Reindexing enabled, cleanup dicts
- if self.reindex:
- self.purge_entry(identifier, self.files, self.words)
- else:
- # DO NOT reindex this file
- if self.quiet < 5:
- print "Not reindexing", identifier
- return 0
+ self.purge_entry(identifier)
# split into words
- words = self.splitter(text, 'text/plain')
+ words = self.splitter(text, mime_type)
# Find new file index, and assign it to identifier
# (_TOP uses trick of negative to avoid conflict with file index)
# make a reference to the file for this word
entry[file_index] = filedict[word]
+ # save needed
+ self.changed = 1
+
def splitter(self, text, ftype):
''' Split the contents of a text string into a list of 'words'
'''
- if ftype in ('text/plain', 'message/rfc822'):
- words = self.text_splitter(text, self.casesensitive)
+ if ftype == 'text/plain':
+ words = self.text_splitter(text)
else:
return []
return words
- def text_splitter(self, text, casesensitive=0):
+ def text_splitter(self, text):
"""Split text/plain string into a list of words
"""
- # Let's adjust case if not case-sensitive
- if not casesensitive:
- text = text.upper()
+ # case insensitive
+ text = text.upper()
# Split the raw text, losing anything longer than 25 characters
# since that'll be gibberish (encoded text or somesuch) or shorter
# place
return re.findall(r'\b\w{2,25}\b', text)
- def search(self, search_terms, klass):
- ''' display search results
+ def search(self, search_terms, klass, ignore={},
+ dre=re.compile(r'([^\d]+)(\d+)')):
+ ''' Display search results looking for [search, terms] associated
+ with the hyperdb Class "klass". Ignore hits on {class: property}.
+
+ "dre" is a helper, not an argument.
'''
+ # do the index lookup
hits = self.find(search_terms)
- links = []
- nodeids = {}
- designator_propname = {'msg': 'messages', 'file': 'files'}
- if hits:
- hitcount = len(hits)
- # build a dictionary of nodes and their associated messages
- # and files
- for hit in hits.keys():
- filename = hits[hit].split('/')[-1]
- for designator, propname in designator_propname.items():
- if not filename.startswith(designator):
- continue
- nodeid = filename[len(designator):]
- result = apply(klass.find, (), {propname:nodeid})
- if not result:
- continue
-
- id = str(result[0])
- if not nodeids.has_key(id):
- nodeids[id] = {}
-
- node_dict = nodeids[id]
- if not node_dict.has_key(propname):
- node_dict[propname] = [nodeid]
- elif node_dict.has_key(propname):
- node_dict[propname].append(nodeid)
+ if not hits:
+ return {}
+ #designator_propname = {'msg': 'messages', 'file': 'files'}
+ designator_propname = {}
+ for nm, propclass in klass.getprops().items():
+ if isinstance(propclass, Link) or isinstance(propclass, Multilink):
+ designator_propname[propclass.classname] = nm
+
+ # build a dictionary of nodes and their associated messages
+ # and files
+ nodeids = {} # this is the answer
+ propspec = {} # used to do the klass.find
+ for propname in designator_propname.values():
+ propspec[propname] = {} # used as a set (value doesn't matter)
+ for classname, nodeid, property in hits.values():
+ # skip this result if we don't care about this class/property
+ if ignore.has_key((classname, property)):
+ continue
+
+ # if it's a property on klass, it's easy
+ if classname == klass.classname:
+ if not nodeids.has_key(nodeid):
+ nodeids[nodeid] = {}
+ continue
+
+ # it's a linked class - set up to do the klass.find
+ linkprop = designator_propname[classname] # eg, msg -> messages
+ propspec[linkprop][nodeid] = 1
+
+ # retain only the meaningful entries
+ for propname, idset in propspec.items():
+ if not idset:
+ del propspec[propname]
+
+ # klass.find tells me the klass nodeids the linked nodes relate to
+ for resid in klass.find(**propspec):
+ resid = str(resid)
+ if not nodeids.has_key(id):
+ nodeids[resid] = {}
+ node_dict = nodeids[resid]
+ # now figure out where it came from
+ for linkprop in propspec.keys():
+ for nodeid in klass.get(resid, linkprop):
+ if propspec[linkprop].has_key(nodeid):
+ # OK, this node[propname] has a winner
+ if not node_dict.has_key(linkprop):
+ node_dict[linkprop] = [nodeid]
+ else:
+ node_dict[linkprop].append(nodeid)
return nodeids
# we override this to ignore not 2 < word < 25 and also to fix a bug -
if not 2 < len(word) < 25:
# word outside the bounds of what we index - ignore
continue
- if not self.casesensitive:
- word = word.upper()
+ word = word.upper()
entry = self.words.get(word) # For each word, get index
entries[word] = entry # of matching files
if not entry: # Nothing for this one word (fail)
return {}
return hits
- segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!"
+ segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
def load_index(self, reload=0, wordlist=None):
# Unless reload is indicated, do not load twice
if self.index_loaded() and not reload:
try:
f = open(self.indexdb + segment, 'rb')
except IOError, error:
- if error.errno != errno.ENOENT:
- raise
+ # probably just nonexistent segment index file
+ if error.errno != errno.ENOENT: raise
else:
pickle_str = zlib.decompress(f.read())
f.close()
self.words = db['WORDS']
self.files = db['FILES']
self.fileids = db['FILEIDS']
+ self.changed = 0
def save_index(self):
+ # only save if the index is loaded and changed
+ if not self.index_loaded() or not self.changed:
+ return
+
# brutal space saver... delete all the small segments
for segment in self.segments:
try:
os.remove(self.indexdb + segment)
- except OSError:
+ except OSError, error:
# probably just nonexistent segment index file
- # TODO: make sure it's an EEXIST
- pass
+ if error.errno != errno.ENOENT: raise
# First write the much simpler filename/fileid dictionaries
dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))
# The hard part is splitting the word dictionary up, of course
- letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#"
+ letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
segdicts = {} # Need batch of empty dicts
for segment in letters:
segdicts[segment] = {}
pickle_fh.write(zlib.compress(pickle_str))
os.chmod(filename, 0664)
- def purge_entry(self, fname, file_dct, word_dct):
+ # save done
+ self.changed = 0
+
+ def purge_entry(self, identifier):
''' Remove a file from file index and word index
'''
- try: # The easy part, cleanup the file index
- file_index = file_dct[fname]
- del file_dct[fname]
- except KeyError:
- pass # We'll assume we only encounter KeyError's
+ self.load_index()
+
+ if not self.files.has_key(identifier):
+ return
+
+ file_index = self.files[identifier][0]
+ del self.files[identifier]
+ del self.fileids[file_index]
+
# The much harder part, cleanup the word index
- for word, occurs in word_dct.items():
+ for key, occurs in self.words.items():
if occurs.has_key(file_index):
del occurs[file_index]
- word_dct[word] = occurs
+
+ # save needed
+ self.changed = 1
def index_loaded(self):
return (hasattr(self,'fileids') and hasattr(self,'files') and
#
#$Log: not supported by cvs2svn $
+#Revision 1.11 2002/07/18 11:17:30 gmcm
+#Add Number and Boolean types to hyperdb.
+#Add conversion cases to web, mail & admin interfaces.
+#Add storage/serialization cases to back_anydbm & back_metakit.
+#
+#Revision 1.10 2002/07/14 23:17:24 richard
+#oops
+#
+#Revision 1.9 2002/07/14 06:11:16 richard
+#Some TODOs
+#
+#Revision 1.8 2002/07/09 21:53:38 gmcm
+#Optimize Class.find so that the propspec can contain a set of ids to match.
+#This is used by indexer.search so it can do just one find for all the index matches.
+#This was already confusing code, but for common terms (lots of index matches),
+#it is enormously faster.
+#
+#Revision 1.7 2002/07/09 21:38:43 richard
+#Only save the index if the thing is loaded and changed. Also, don't load
+#the index just for a save.
+#
+#Revision 1.6 2002/07/09 04:26:44 richard
+#We're indexing numbers now, and _underscore words
+#
+#Revision 1.5 2002/07/09 04:19:09 richard
+#Added reindex command to roundup-admin.
+#Fixed reindex on first access.
+#Also fixed reindexing of entries that change.
+#
+#Revision 1.4 2002/07/09 03:02:52 richard
+#More indexer work:
+#- all String properties may now be indexed too. Currently there's a bit of
+# "issue" specific code in the actual searching which needs to be
+# addressed. In a nutshell:
+# + pass 'indexme="yes"' as a String() property initialisation arg, eg:
+# file = FileClass(db, "file", name=String(), type=String(),
+# comment=String(indexme="yes"))
+# + the comment will then be indexed and be searchable, with the results
+# related back to the issue that the file is linked to
+#- as a result of this work, the FileClass has a default MIME type that may
+# be overridden in a subclass, or by the use of a "type" property as is
+# done in the default templates.
+#- the regeneration of the indexes (if necessary) is done once the schema is
+# set up in the dbinit.
+#
+#Revision 1.3 2002/07/08 06:58:15 richard
+#cleaned up the indexer code:
+# - it splits more words out (much simpler, faster splitter)
+# - removed code we'll never use (roundup.roundup_indexer has the full
+# implementation, and replaces roundup.indexer)
+# - only index text/plain and message/rfc822 (ideas for other text formats to
+# index are welcome)
+# - added simple unit test for indexer. Needs more tests for regression.
+#
#Revision 1.2 2002/05/25 07:16:24 rochecompaan
#Merged search_indexing-branch with HEAD
#