X-Git-Url: https://git.tokkee.org/?a=blobdiff_plain;f=roundup%2Findexer.py;h=4cd5d24ad1b763e5571f1d4bc25233a22715594d;hb=53ea64cdb21d7beee1f69c3b7057d97ec4e84a74;hp=35e5a2990ba0f6da40b5171fcb977d4bc3f9ecfa;hpb=369fb117ee3618ba00b5386cbaa4b4f19c2db4a4;p=roundup.git diff --git a/roundup/indexer.py b/roundup/indexer.py index 35e5a29..4cd5d24 100644 --- a/roundup/indexer.py +++ b/roundup/indexer.py @@ -14,24 +14,27 @@ # that promote freedom, but obviously am giving up any rights # to compel such. # -#$Id: indexer.py,v 1.11 2002-07-18 11:17:30 gmcm Exp $ -''' -This module provides an indexer class, RoundupIndexer, that stores text +#$Id: indexer.py,v 1.18 2004-02-11 23:55:08 richard Exp $ +'''This module provides an indexer class, RoundupIndexer, that stores text indices in a roundup instance. This class makes searching the content of messages, string properties and text files possible. ''' +__docformat__ = 'restructuredtext' + import os, shutil, re, mimetypes, marshal, zlib, errno from hyperdb import Link, Multilink class Indexer: - ''' Indexes information from roundup's hyperdb to allow efficient - searching. + '''Indexes information from roundup's hyperdb to allow efficient + searching. + + Three structures are created by the indexer:: - Three structures are created by the indexer: files {identifier: (fileid, wordcount)} words {word: {fileid: count}} fileids {fileid: identifier} - where identifier is (classname, nodeid, propertyname) + + where identifier is (classname, nodeid, propertyname) ''' def __init__(self, db_path): self.indexdb_path = os.path.join(db_path, 'indexes') @@ -69,8 +72,8 @@ class Indexer: return self.reindex def add_text(self, identifier, text, mime_type='text/plain'): - ''' Add some text associated with the (classname, nodeid, property) - identifier. + '''Add some text associated with the (classname, nodeid, property) + identifier. ''' # make sure the index is loaded self.load_index() @@ -114,7 +117,7 @@ class Indexer: self.changed = 1 def splitter(self, text, ftype): - ''' Split the contents of a text string into a list of 'words' + '''Split the contents of a text string into a list of 'words' ''' if ftype == 'text/plain': words = self.text_splitter(text) @@ -136,17 +139,16 @@ class Indexer: def search(self, search_terms, klass, ignore={}, dre=re.compile(r'([^\d]+)(\d+)')): - ''' Display search results looking for [search, terms] associated - with the hyperdb Class "klass". Ignore hits on {class: property}. + '''Display search results looking for [search, terms] associated + with the hyperdb Class "klass". Ignore hits on {class: property}. - "dre" is a helper, not an argument. + "dre" is a helper, not an argument. ''' # do the index lookup hits = self.find(search_terms) if not hits: return {} - #designator_propname = {'msg': 'messages', 'file': 'files'} designator_propname = {} for nm, propclass in klass.getprops().items(): if isinstance(propclass, Link) or isinstance(propclass, Multilink): @@ -154,7 +156,7 @@ class Indexer: # build a dictionary of nodes and their associated messages # and files - nodeids = {} # this is the answer + nodeids = {} # this is the answer propspec = {} # used to do the klass.find for propname in designator_propname.values(): propspec[propname] = {} # used as a set (value doesn't matter) @@ -169,6 +171,10 @@ class Indexer: nodeids[nodeid] = {} continue + # make sure the class is a linked one, otherwise ignore + if not designator_propname.has_key(classname): + continue + # it's a linked class - set up to do the klass.find linkprop = designator_propname[classname] # eg, msg -> messages propspec[linkprop][nodeid] = 1 @@ -198,7 +204,7 @@ class Indexer: # we override this to ignore not 2 < word < 25 and also to fix a bug - # the (fail) case. def find(self, wordlist): - ''' Locate files that match ALL the words in wordlist + '''Locate files that match ALL the words in wordlist ''' if not hasattr(self, 'words'): self.load_index() @@ -217,6 +223,8 @@ class Indexer: if hits is None: hits = {} for k in entry.keys(): + if not self.fileids.has_key(k): + raise ValueError, 'Index is corrupted: re-generate it' hits[k] = self.fileids[k] else: # Eliminate hits for every non-match @@ -310,8 +318,10 @@ class Indexer: self.changed = 0 def purge_entry(self, identifier): - ''' Remove a file from file index and word index + '''Remove a file from file index and word index ''' + self.load_index() + if not self.files.has_key(identifier): return @@ -331,74 +341,4 @@ class Indexer: return (hasattr(self,'fileids') and hasattr(self,'files') and hasattr(self,'words')) -# -#$Log: not supported by cvs2svn $ -#Revision 1.10 2002/07/14 23:17:24 richard -#oops -# -#Revision 1.9 2002/07/14 06:11:16 richard -#Some TODOs -# -#Revision 1.8 2002/07/09 21:53:38 gmcm -#Optimize Class.find so that the propspec can contain a set of ids to match. -#This is used by indexer.search so it can do just one find for all the index matches. -#This was already confusing code, but for common terms (lots of index matches), -#it is enormously faster. -# -#Revision 1.7 2002/07/09 21:38:43 richard -#Only save the index if the thing is loaded and changed. Also, don't load -#the index just for a save. -# -#Revision 1.6 2002/07/09 04:26:44 richard -#We're indexing numbers now, and _underscore words -# -#Revision 1.5 2002/07/09 04:19:09 richard -#Added reindex command to roundup-admin. -#Fixed reindex on first access. -#Also fixed reindexing of entries that change. -# -#Revision 1.4 2002/07/09 03:02:52 richard -#More indexer work: -#- all String properties may now be indexed too. Currently there's a bit of -# "issue" specific code in the actual searching which needs to be -# addressed. In a nutshell: -# + pass 'indexme="yes"' as a String() property initialisation arg, eg: -# file = FileClass(db, "file", name=String(), type=String(), -# comment=String(indexme="yes")) -# + the comment will then be indexed and be searchable, with the results -# related back to the issue that the file is linked to -#- as a result of this work, the FileClass has a default MIME type that may -# be overridden in a subclass, or by the use of a "type" property as is -# done in the default templates. -#- the regeneration of the indexes (if necessary) is done once the schema is -# set up in the dbinit. -# -#Revision 1.3 2002/07/08 06:58:15 richard -#cleaned up the indexer code: -# - it splits more words out (much simpler, faster splitter) -# - removed code we'll never use (roundup.roundup_indexer has the full -# implementation, and replaces roundup.indexer) -# - only index text/plain and rfc822/message (ideas for other text formats to -# index are welcome) -# - added simple unit test for indexer. Needs more tests for regression. -# -#Revision 1.2 2002/05/25 07:16:24 rochecompaan -#Merged search_indexing-branch with HEAD -# -#Revision 1.1.2.3 2002/05/02 11:52:12 rochecompaan -#Fixed small bug that prevented indexes from being generated. -# -#Revision 1.1.2.2 2002/04/19 19:54:42 rochecompaan -#cgi_client.py -# removed search link for the time being -# moved rendering of matches to htmltemplate -#hyperdb.py -# filtering of nodes on full text search incorporated in filter method -#roundupdb.py -# added paramater to call of filter method -#roundup_indexer.py -# added search method to RoundupIndexer class -# -#Revision 1.1.2.1 2002/04/03 11:55:57 rochecompaan -# . Added feature #526730 - search for messages capability -# +# vim: set filetype=python ts=4 sw=4 et si