diff --git a/roundup/indexer.py b/roundup/indexer.py
index a1edc712d848c6e8c97689ae2d9dbd8f768f1e5c..e4d1d59b02c186e522b9b3addc707bd52ffe7c6a 100644 (file)
--- a/roundup/indexer.py
+++ b/roundup/indexer.py
# that promote freedom, but obviously am giving up any rights
# to compel such.
#
-#$Id: indexer.py,v 1.7 2002-07-09 21:38:43 richard Exp $
+#$Id: indexer.py,v 1.14 2002-09-25 05:06:14 richard Exp $
'''
This module provides an indexer class, Indexer, that stores text
indices in a roundup instance. This class makes searching the content of
-messages and text files possible.
+messages, string properties and text files possible.
'''
import os, shutil, re, mimetypes, marshal, zlib, errno
+from hyperdb import Link, Multilink
class Indexer:
''' Indexes information from roundup's hyperdb to allow efficient
files {identifier: (fileid, wordcount)}
words {word: {fileid: count}}
fileids {fileid: identifier}
+ where identifier is (classname, nodeid, propertyname)
'''
def __init__(self, db_path):
self.indexdb_path = os.path.join(db_path, 'indexes')
self.changed = 0
# see if we need to reindex because of a change in code
+ version = os.path.join(self.indexdb_path, 'version')
if (not os.path.exists(self.indexdb_path) or
- not os.path.exists(os.path.join(self.indexdb_path, 'version'))):
- # TODO: if the version file exists (in the future) we'll want to
- # check the value in it - for now the file itself is a flag
+ not os.path.exists(version)):
+ # a missing index directory or version file means we must (re)index
self.force_reindex()
+ elif os.path.exists(version):
+ version = open(version).read()
+ # check the value and reindex if it's not the latest
+ if version.strip() != '1':
+ self.force_reindex()
def force_reindex(self):
'''Force a reindex condition
if not hits:
return {}
- # this is specific to "issue" klass ... eugh
- designator_propname = {'msg': 'messages', 'file': 'files'}
+ #designator_propname = {'msg': 'messages', 'file': 'files'}
+ designator_propname = {}
+ for nm, propclass in klass.getprops().items():
+ if isinstance(propclass, Link) or isinstance(propclass, Multilink):
+ designator_propname[propclass.classname] = nm
# build a dictionary of klass nodes and the linked nodes (eg. messages
# and files) found in the hits
- nodeids = {}
+ nodeids = {} # this is the answer
+ propspec = {} # used to do the klass.find
+ for propname in designator_propname.values():
+ propspec[propname] = {} # used as a set (value doesn't matter)
for classname, nodeid, property in hits.values():
# skip this result if we don't care about this class/property
if ignore.has_key((classname, property)):
nodeids[nodeid] = {}
continue
- # it's a linked class - find the klass entries that are
- # linked to it
- linkprop = designator_propname[classname]
- for resid in klass.find(**{linkprop: nodeid}):
- resid = str(resid)
- if not nodeids.has_key(id):
- nodeids[resid] = {}
-
- # update the links for this klass nodeid
- node_dict = nodeids[resid]
- if not node_dict.has_key(linkprop):
- node_dict[linkprop] = [nodeid]
- elif node_dict.has_key(linkprop):
- node_dict[linkprop].append(nodeid)
+ # make sure the class is a linked one, otherwise ignore
+ if not designator_propname.has_key(classname):
+ continue
+
+ # it's a linked class - set up to do the klass.find
+ linkprop = designator_propname[classname] # eg, msg -> messages
+ propspec[linkprop][nodeid] = 1
+
+ # retain only the meaningful entries
+ for propname, idset in propspec.items():
+ if not idset:
+ del propspec[propname]
+
+ # klass.find tells me the klass nodeids the linked nodes relate to
+ for resid in klass.find(**propspec):
+ resid = str(resid)
+ if not nodeids.has_key(id):
+ nodeids[resid] = {}
+ node_dict = nodeids[resid]
+ # now figure out where it came from
+ for linkprop in propspec.keys():
+ for nodeid in klass.get(resid, linkprop):
+ if propspec[linkprop].has_key(nodeid):
+ # OK, this node[propname] has a winner
+ if not node_dict.has_key(linkprop):
+ node_dict[linkprop] = [nodeid]
+ else:
+ node_dict[linkprop].append(nodeid)
return nodeids
# we override this to ignore words whose length is not 2 < len < 25 and also to fix a bug -
try:
f = open(self.indexdb + segment, 'rb')
except IOError, error:
- if error.errno != errno.ENOENT:
- raise
+ # probably just nonexistent segment index file
+ if error.errno != errno.ENOENT: raise
else:
pickle_str = zlib.decompress(f.read())
f.close()
for segment in self.segments:
try:
os.remove(self.indexdb + segment)
- except OSError:
+ except OSError, error:
# probably just nonexistent segment index file
- # TODO: make sure it's an EEXIST
- pass
+ if error.errno != errno.ENOENT: raise
# First write the much simpler filename/fileid dictionaries
dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
def purge_entry(self, identifier):
''' Remove a file from file index and word index
'''
+ self.load_index()
+
if not self.files.has_key(identifier):
return
return (hasattr(self,'fileids') and hasattr(self,'files') and
hasattr(self,'words'))
-#
-#$Log: not supported by cvs2svn $
-#Revision 1.6 2002/07/09 04:26:44 richard
-#We're indexing numbers now, and _underscore words
-#
-#Revision 1.5 2002/07/09 04:19:09 richard
-#Added reindex command to roundup-admin.
-#Fixed reindex on first access.
-#Also fixed reindexing of entries that change.
-#
-#Revision 1.4 2002/07/09 03:02:52 richard
-#More indexer work:
-#- all String properties may now be indexed too. Currently there's a bit of
-# "issue" specific code in the actual searching which needs to be
-# addressed. In a nutshell:
-# + pass 'indexme="yes"' as a String() property initialisation arg, eg:
-# file = FileClass(db, "file", name=String(), type=String(),
-# comment=String(indexme="yes"))
-# + the comment will then be indexed and be searchable, with the results
-# related back to the issue that the file is linked to
-#- as a result of this work, the FileClass has a default MIME type that may
-# be overridden in a subclass, or by the use of a "type" property as is
-# done in the default templates.
-#- the regeneration of the indexes (if necessary) is done once the schema is
-# set up in the dbinit.
-#
-#Revision 1.3 2002/07/08 06:58:15 richard
-#cleaned up the indexer code:
-# - it splits more words out (much simpler, faster splitter)
-# - removed code we'll never use (roundup.roundup_indexer has the full
-# implementation, and replaces roundup.indexer)
-# - only index text/plain and rfc822/message (ideas for other text formats to
-# index are welcome)
-# - added simple unit test for indexer. Needs more tests for regression.
-#
-#Revision 1.2 2002/05/25 07:16:24 rochecompaan
-#Merged search_indexing-branch with HEAD
-#
-#Revision 1.1.2.3 2002/05/02 11:52:12 rochecompaan
-#Fixed small bug that prevented indexes from being generated.
-#
-#Revision 1.1.2.2 2002/04/19 19:54:42 rochecompaan
-#cgi_client.py
-# removed search link for the time being
-# moved rendering of matches to htmltemplate
-#hyperdb.py
-# filtering of nodes on full text search incorporated in filter method
-#roundupdb.py
-# added paramater to call of filter method
-#roundup_indexer.py
-# added search method to RoundupIndexer class
-#
-#Revision 1.1.2.1 2002/04/03 11:55:57 rochecompaan
-# . Added feature #526730 - search for messages capability
-#
+# vim: set filetype=python ts=4 sw=4 et si