X-Git-Url: https://git.tokkee.org/?a=blobdiff_plain;f=roundup%2Findexer.py;h=4295b70167573e4b76e749abe51a466b1496be12;hb=efc777b7a124948b50aa5fa521cba8bde4486fc1;hp=d09e950d8102a1b6843ea8150d8068f8b491a7ba;hpb=d195729acba7813a8e314a76d3a6ba73f6dc9f1f;p=roundup.git

diff --git a/roundup/indexer.py b/roundup/indexer.py
index d09e950..4295b70 100644
--- a/roundup/indexer.py
+++ b/roundup/indexer.py
@@ -14,13 +14,14 @@
 #     that promote freedom, but obviously am giving up any rights
 #     to compel such.
 # 
-#$Id: indexer.py,v 1.6 2002-07-09 04:26:44 richard Exp $
+#$Id: indexer.py,v 1.13 2002-09-10 00:18:20 richard Exp $
 '''
 This module provides an indexer class, RoundupIndexer, that stores text
 indices in a roundup instance.  This class makes searching the content of
-messages and text files possible.
+messages, string properties and text files possible.
 '''
 import os, shutil, re, mimetypes, marshal, zlib, errno
+from hyperdb import Link, Multilink
 
 class Indexer:
     ''' Indexes information from roundup's hyperdb to allow efficient
@@ -30,20 +31,26 @@ class Indexer:
           files   {identifier: (fileid, wordcount)}
           words   {word: {fileid: count}}
           fileids {fileid: identifier}
+        where identifier is (classname, nodeid, propertyname)
     '''
     def __init__(self, db_path):
         self.indexdb_path = os.path.join(db_path, 'indexes')
         self.indexdb = os.path.join(self.indexdb_path, 'index.db')
         self.reindex = 0
-        self.casesensitive = 0
         self.quiet = 9
+        self.changed = 0
 
         # see if we need to reindex because of a change in code
+        version = os.path.join(self.indexdb_path, 'version')
         if (not os.path.exists(self.indexdb_path) or
-                not os.path.exists(os.path.join(self.indexdb_path, 'version'))):
-            # TODO: if the version file exists (in the future) we'll want to
-            # check the value in it - for now the file itself is a flag
+                not os.path.exists(version)):
+            # no index dir or version file yet - flag that a reindex is needed
             self.force_reindex()
+        elif os.path.exists(version):
+            version = open(version).read()
+            # check the value and reindex if it's not the latest
+            if version.strip() != '1':
+                self.force_reindex()
 
     def force_reindex(self):
         '''Force a reindex condition
@@ -54,6 +61,7 @@ class Indexer:
         os.chmod(self.indexdb_path, 0775)
         open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
         self.reindex = 1
+        self.changed = 1
 
     def should_reindex(self):
         '''Should we reindex?
@@ -102,21 +110,23 @@ class Indexer:
             # make a reference to the file for this word
             entry[file_index] = filedict[word]
 
+        # save needed
+        self.changed = 1
+
     def splitter(self, text, ftype):
         ''' Split the contents of a text string into a list of 'words'
         '''
         if ftype == 'text/plain':
-            words = self.text_splitter(text, self.casesensitive)
+            words = self.text_splitter(text)
         else:
             return []
         return words
 
-    def text_splitter(self, text, casesensitive=0):
+    def text_splitter(self, text):
         """Split text/plain string into a list of words
         """
-        # Let's adjust case if not case-sensitive
-        if not casesensitive:
-            text = text.upper()
+        # case insensitive
+        text = text.upper()
 
         # Split the raw text, losing anything longer than 25 characters
         # since that'll be gibberish (encoded text or somesuch) or shorter
@@ -136,12 +146,18 @@ class Indexer:
         if not hits:
             return {}
 
-        # this is specific to "issue" klass ... eugh
-        designator_propname = {'msg': 'messages', 'file': 'files'}
+        # linked classname -> klass property name, eg. {'msg': 'messages'}
+        designator_propname = {}
+        for nm, propclass in klass.getprops().items():
+            if isinstance(propclass, Link) or isinstance(propclass, Multilink):
+                designator_propname[propclass.classname] = nm
 
         # build a dictionary of nodes and their associated messages
         # and files
-        nodeids = {}
+        nodeids = {}    # this is the answer
+        propspec = {}     # used to do the klass.find
+        for propname in designator_propname.values():
+            propspec[propname] = {}   # used as a set (value doesn't matter)
         for classname, nodeid, property in hits.values():
             # skip this result if we don't care about this class/property
             if ignore.has_key((classname, property)):
@@ -153,20 +169,30 @@ class Indexer:
                     nodeids[nodeid] = {}
                 continue
 
-            # it's a linked class - find the klass entries that are
-            # linked to it
-            linkprop = designator_propname[classname]
-            for resid in klass.find(**{linkprop: nodeid}):
-                resid = str(resid)
-                if not nodeids.has_key(id):
-                    nodeids[resid] = {}
-
-                # update the links for this klass nodeid
-                node_dict = nodeids[resid]
-                if not node_dict.has_key(linkprop):
-                    node_dict[linkprop] = [nodeid]
-                elif node_dict.has_key(linkprop):
-                    node_dict[linkprop].append(nodeid)
+            # it's a linked class - set up to do the klass.find
+            linkprop = designator_propname[classname]   # eg, msg -> messages
+            propspec[linkprop][nodeid] = 1
+
+        # retain only the meaningful entries
+        for propname, idset in propspec.items():
+            if not idset:
+                del propspec[propname]
+        
+        # klass.find() returns the klass nodeids that link to any of the
+        # matched nodes collected in propspec
+        for resid in klass.find(**propspec):
+            resid = str(resid)
+            # keep an entry already recorded for resid (the old test used
+            # the builtin "id" as the key, so it never matched)
+            if not nodeids.has_key(resid):
+                nodeids[resid] = {}
+            node_dict = nodeids[resid]
+            # record which linked nodes caused this klass node to match
+            for linkprop in propspec.keys():
+                for nodeid in klass.get(resid, linkprop):
+                    if propspec[linkprop].has_key(nodeid):
+                        # nodeid is one of the indexer hits for linkprop
+                        node_dict.setdefault(linkprop, []).append(nodeid)
         return nodeids
 
     # we override this to ignore not 2 < word < 25 and also to fix a bug -
@@ -183,8 +209,7 @@ class Indexer:
             if not 2 < len(word) < 25:
                 # word outside the bounds of what we index - ignore
                 continue
-            if not self.casesensitive:
-                word = word.upper()
+            word = word.upper()
             entry = self.words.get(word)    # For each word, get index
             entries[word] = entry           #   of matching files
             if not entry:                   # Nothing for this one word (fail)
@@ -224,8 +249,8 @@ class Indexer:
             try:
                 f = open(self.indexdb + segment, 'rb')
             except IOError, error:
-                if error.errno != errno.ENOENT:
-                    raise
+                # probably just nonexistent segment index file
+                if error.errno != errno.ENOENT: raise
             else:
                 pickle_str = zlib.decompress(f.read())
                 f.close()
@@ -244,19 +269,20 @@ class Indexer:
         self.words = db['WORDS']
         self.files = db['FILES']
         self.fileids = db['FILEIDS']
+        self.changed = 0
 
     def save_index(self):
-        # make sure we're loaded
-        self.load_index()
+        # only save if the index is loaded and changed
+        if not self.index_loaded() or not self.changed:
+            return
 
         # brutal space saver... delete all the small segments
         for segment in self.segments:
             try:
                 os.remove(self.indexdb + segment)
-            except OSError:
+            except OSError, error:
                 # probably just nonexistent segment index file
-                # TODO: make sure it's an EEXIST
-                pass
+                if error.errno != errno.ENOENT: raise
 
         # First write the much simpler filename/fileid dictionaries
         dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
@@ -280,9 +306,14 @@ class Indexer:
             pickle_fh.write(zlib.compress(pickle_str))
             os.chmod(filename, 0664)
 
+        # save done
+        self.changed = 0
+
     def purge_entry(self, identifier):
         ''' Remove a file from file index and word index
         '''
+        self.load_index()
+
         if not self.files.has_key(identifier):
             return
 
@@ -295,59 +326,11 @@ class Indexer:
             if occurs.has_key(file_index):
                 del occurs[file_index]
 
+        # save needed
+        self.changed = 1
+
     def index_loaded(self):
         return (hasattr(self,'fileids') and hasattr(self,'files') and
             hasattr(self,'words'))
 
-#
-#$Log: not supported by cvs2svn $
-#Revision 1.5  2002/07/09 04:19:09  richard
-#Added reindex command to roundup-admin.
-#Fixed reindex on first access.
-#Also fixed reindexing of entries that change.
-#
-#Revision 1.4  2002/07/09 03:02:52  richard
-#More indexer work:
-#- all String properties may now be indexed too. Currently there's a bit of
-#  "issue" specific code in the actual searching which needs to be
-#  addressed. In a nutshell:
-#  + pass 'indexme="yes"' as a String() property initialisation arg, eg:
-#        file = FileClass(db, "file", name=String(), type=String(),
-#            comment=String(indexme="yes"))
-#  + the comment will then be indexed and be searchable, with the results
-#    related back to the issue that the file is linked to
-#- as a result of this work, the FileClass has a default MIME type that may
-#  be overridden in a subclass, or by the use of a "type" property as is
-#  done in the default templates.
-#- the regeneration of the indexes (if necessary) is done once the schema is
-#  set up in the dbinit.
-#
-#Revision 1.3  2002/07/08 06:58:15  richard
-#cleaned up the indexer code:
-# - it splits more words out (much simpler, faster splitter)
-# - removed code we'll never use (roundup.roundup_indexer has the full
-#   implementation, and replaces roundup.indexer)
-# - only index text/plain and rfc822/message (ideas for other text formats to
-#   index are welcome)
-# - added simple unit test for indexer. Needs more tests for regression.
-#
-#Revision 1.2  2002/05/25 07:16:24  rochecompaan
-#Merged search_indexing-branch with HEAD
-#
-#Revision 1.1.2.3  2002/05/02 11:52:12  rochecompaan
-#Fixed small bug that prevented indexes from being generated.
-#
-#Revision 1.1.2.2  2002/04/19 19:54:42  rochecompaan
-#cgi_client.py
-#    removed search link for the time being
-#    moved rendering of matches to htmltemplate
-#hyperdb.py
-#    filtering of nodes on full text search incorporated in filter method
-#roundupdb.py
-#    added paramater to call of filter method
-#roundup_indexer.py
-#    added search method to RoundupIndexer class
-#
-#Revision 1.1.2.1  2002/04/03 11:55:57  rochecompaan
-# . Added feature #526730 - search for messages capability
-#
+# vim: set filetype=python ts=4 sw=4 et si