From 73ac29efd25c466f381bb0963e93ac2f80331f94 Mon Sep 17 00:00:00 2001 From: gmcm Date: Tue, 9 Jul 2002 21:53:38 +0000 Subject: [PATCH] Optimize Class.find so that the propspec can contain a set of ids to match. This is used by indexer.search so it can do just one find for all the index matches. This was already confusing code, but for common terms (lots of index matches), it is enormously faster. git-svn-id: http://svn.roundup-tracker.org/svnroot/roundup/trunk@848 57a73879-2fb5-44c3-a270-3262357dd7e2 --- roundup/hyperdb.py | 56 +++++++++++++++++++++++++++++++++---------- roundup/indexer.py | 60 +++++++++++++++++++++++++++++++--------------- 2 files changed, 84 insertions(+), 32 deletions(-) diff --git a/roundup/hyperdb.py b/roundup/hyperdb.py index efbc716..ce97cfb 100644 --- a/roundup/hyperdb.py +++ b/roundup/hyperdb.py @@ -15,7 +15,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -# $Id: hyperdb.py,v 1.71 2002-07-09 03:02:52 richard Exp $ +# $Id: hyperdb.py,v 1.72 2002-07-09 21:53:38 gmcm Exp $ __doc__ = """ Hyperdatabase implementation, especially field types. @@ -785,24 +785,28 @@ class Class: # XXX: change from spec - allows multiple props to match def find(self, **propspec): - """Get the ids of nodes in this class which link to a given node. + """Get the ids of nodes in this class which link to the given nodes. - 'propspec' consists of keyword args propname=nodeid + 'propspec' consists of keyword args propname={nodeid:1,} 'propname' must be the name of a property in this class, or a KeyError is raised. That property must be a Link or Multilink property, or a TypeError is raised. - 'nodeid' must be the id of an existing node in the class linked - to by the given property, or an IndexError is raised. + Any node in this class whose 'propname' property links to any of the + nodeids will be returned. Used by the full text indexing, which knows + that "foo" occurs in msg1, msg3 and file7, so we have hits on these issues: + db.issue.find(messages={'1':1,'3':1}, files={'7':1}) """ propspec = propspec.items() - for propname, nodeid in propspec: + for propname, nodeids in propspec: # check the prop is OK prop = self.properties[propname] if not isinstance(prop, Link) and not isinstance(prop, Multilink): raise TypeError, "'%s' not a Link/Multilink property"%propname - if not self.db.hasnode(prop.classname, nodeid): - raise ValueError, '%s has no node %s'%(prop.classname, nodeid) + #XXX edit is expensive and of questionable use + #for nodeid in nodeids: + # if not self.db.hasnode(prop.classname, nodeid): + # raise ValueError, '%s has no node %s'%(prop.classname, nodeid) # ok, now do the find cldb = self.db.getclassdb(self.classname) @@ -811,16 +815,26 @@ class Class: node = self.db.getnode(self.classname, id, db=cldb) if node.has_key(self.db.RETIRED_FLAG): continue - for propname, nodeid in propspec: + for propname, nodeids in propspec: # can't test if the node doesn't have this property if not node.has_key(propname): continue + if type(nodeids) is type(''): + nodeids = {nodeids:1} prop = self.properties[propname] - property = node[propname] - if isinstance(prop, Link) and nodeid == property: - l.append(id) - elif isinstance(prop, Multilink) and nodeid in property: + value = node[propname] + if isinstance(prop, Link) and nodeids.has_key(value): l.append(id) + break + elif isinstance(prop, Multilink): + hit = 0 + for v in value: + if nodeids.has_key(v): + l.append(id) + hit = 1 + break + if hit: + break return l def stringFind(self, **requirements): @@ -1185,6 +1199,22 @@ def Choice(name, db, *options): # # $Log: not supported by cvs2svn $ +# Revision 1.71 2002/07/09 03:02:52 richard +# More indexer work: +# - all String properties may now be indexed too. Currently there's a bit of +# "issue" specific code in the actual searching which needs to be +# addressed. In a nutshell: +# + pass 'indexme="yes"' as a String() property initialisation arg, eg: +# file = FileClass(db, "file", name=String(), type=String(), +# comment=String(indexme="yes")) +# + the comment will then be indexed and be searchable, with the results +# related back to the issue that the file is linked to +# - as a result of this work, the FileClass has a default MIME type that may +# be overridden in a subclass, or by the use of a "type" property as is +# done in the default templates. +# - the regeneration of the indexes (if necessary) is done once the schema is +# set up in the dbinit. +# # Revision 1.70 2002/06/27 12:06:20 gmcm # Improve an error message. # diff --git a/roundup/indexer.py b/roundup/indexer.py index a1edc71..9cd0cdc 100644 --- a/roundup/indexer.py +++ b/roundup/indexer.py @@ -14,13 +14,14 @@ # that promote freedom, but obviously am giving up any rights # to compel such. # -#$Id: indexer.py,v 1.7 2002-07-09 21:38:43 richard Exp $ +#$Id: indexer.py,v 1.8 2002-07-09 21:53:38 gmcm Exp $ ''' This module provides an indexer class, RoundupIndexer, that stores text indices in a roundup instance. This class makes searching the content of -messages and text files possible. +messages, string properties and text files possible. ''' import os, shutil, re, mimetypes, marshal, zlib, errno +from hyperdb import Link, Multilink class Indexer: ''' Indexes information from roundup's hyperdb to allow efficient @@ -30,6 +31,7 @@ class Indexer: files {identifier: (fileid, wordcount)} words {word: {fileid: count}} fileids {fileid: identifier} + where identifier is (classname, nodeid, propertyname) ''' def __init__(self, db_path): self.indexdb_path = os.path.join(db_path, 'indexes') @@ -139,12 +141,18 @@ class Indexer: if not hits: return {} - # this is specific to "issue" klass ... eugh - designator_propname = {'msg': 'messages', 'file': 'files'} + #designator_propname = {'msg': 'messages', 'file': 'files'} + designator_propname = {} + for nm, propclass in klass.getprops().items(): + if isinstance(propclass, Link) or isinstance(propclass, Multilink): + designator_propname[propclass.classname] = nm # build a dictionary of nodes and their associated messages # and files - nodeids = {} + nodeids = {} # this is the answer + propspec = {} # used to do the klass.find + for propname in designator_propname.values(): + propspec[propname] = {} # used as a set (value doesn't matter) for classname, nodeid, property in hits.values(): # skip this result if we don't care about this class/property if ignore.has_key((classname, property)): @@ -156,20 +164,30 @@ class Indexer: nodeids[nodeid] = {} continue - # it's a linked class - find the klass entries that are - # linked to it - linkprop = designator_propname[classname] - for resid in klass.find(**{linkprop: nodeid}): - resid = str(resid) - if not nodeids.has_key(id): - nodeids[resid] = {} - - # update the links for this klass nodeid - node_dict = nodeids[resid] - if not node_dict.has_key(linkprop): - node_dict[linkprop] = [nodeid] - elif node_dict.has_key(linkprop): - node_dict[linkprop].append(nodeid) + # it's a linked class - set up to do the klass.find + linkprop = designator_propname[classname] # eg, msg -> messages + propspec[linkprop][nodeid] = 1 + + # retain only the meaningful entries + for propname, idset in propspec.items(): + if not idset: + del propspec[propname] + + # klass.find tells me the klass nodeids the linked nodes relate to + for resid in klass.find(**propspec): + resid = str(resid) + if not nodeids.has_key(id): + nodeids[resid] = {} + node_dict = nodeids[resid] + # now figure out where it came from + for linkprop in propspec.keys(): + for nodeid in klass.get(resid, linkprop): + if propspec[linkprop].has_key(nodeid): + # OK, this node[propname] has a winner + if not node_dict.has_key(linkprop): + node_dict[linkprop] = [nodeid] + else: + node_dict[linkprop].append(nodeid) return nodeids # we override this to ignore not 2 < word < 25 and also to fix a bug - @@ -311,6 +329,10 @@ class Indexer: # #$Log: not supported by cvs2svn $ +#Revision 1.7 2002/07/09 21:38:43 richard +#Only save the index if the thing is loaded and changed. Also, don't load +#the index just for a save. +# #Revision 1.6 2002/07/09 04:26:44 richard #We're indexing numbers now, and _underscore words # -- 2.30.2