diff --git a/roundup/indexer.py b/roundup/indexer.py
index 35e5a2990ba0f6da40b5171fcb977d4bc3f9ecfa..4cd5d24ad1b763e5571f1d4bc25233a22715594d 100644 (file)
--- a/roundup/indexer.py
+++ b/roundup/indexer.py
# that promote freedom, but obviously am giving up any rights
# to compel such.
#
-#$Id: indexer.py,v 1.11 2002-07-18 11:17:30 gmcm Exp $
-'''
-This module provides an indexer class, RoundupIndexer, that stores text
+#$Id: indexer.py,v 1.18 2004-02-11 23:55:08 richard Exp $
+'''This module provides an indexer class, Indexer, that stores text
indices in a roundup instance. This class makes searching the content of
messages, string properties and text files possible.
'''
+__docformat__ = 'restructuredtext'
+
import os, shutil, re, mimetypes, marshal, zlib, errno
from hyperdb import Link, Multilink
class Indexer:
- ''' Indexes information from roundup's hyperdb to allow efficient
- searching.
+ '''Indexes information from roundup's hyperdb to allow efficient
+ searching.
+
+ Three structures are created by the indexer::
- Three structures are created by the indexer:
files {identifier: (fileid, wordcount)}
words {word: {fileid: count}}
fileids {fileid: identifier}
- where identifier is (classname, nodeid, propertyname)
+
+ where identifier is (classname, nodeid, propertyname)
'''
def __init__(self, db_path):
self.indexdb_path = os.path.join(db_path, 'indexes')
return self.reindex
def add_text(self, identifier, text, mime_type='text/plain'):
- ''' Add some text associated with the (classname, nodeid, property)
- identifier.
+ '''Add some text associated with the (classname, nodeid, property)
+ identifier.
'''
# make sure the index is loaded
self.load_index()
self.changed = 1
def splitter(self, text, ftype):
- ''' Split the contents of a text string into a list of 'words'
+ '''Split the contents of a text string into a list of 'words'
'''
if ftype == 'text/plain':
words = self.text_splitter(text)
def search(self, search_terms, klass, ignore={},
dre=re.compile(r'([^\d]+)(\d+)')):
- ''' Display search results looking for [search, terms] associated
- with the hyperdb Class "klass". Ignore hits on {class: property}.
+ '''Display search results looking for [search, terms] associated
+ with the hyperdb Class "klass". Ignore hits on {class: property}.
- "dre" is a helper, not an argument.
+ "dre" is a helper, not an argument.
'''
# do the index lookup
hits = self.find(search_terms)
if not hits:
return {}
- #designator_propname = {'msg': 'messages', 'file': 'files'}
designator_propname = {}
for nm, propclass in klass.getprops().items():
if isinstance(propclass, Link) or isinstance(propclass, Multilink):
# build a dictionary of nodes and their associated messages
# and files
- nodeids = {} # this is the answer
+ nodeids = {} # this is the answer
propspec = {} # used to do the klass.find
for propname in designator_propname.values():
propspec[propname] = {} # used as a set (value doesn't matter)
nodeids[nodeid] = {}
continue
+ # make sure the class is a linked one, otherwise ignore
+ if not designator_propname.has_key(classname):
+ continue
+
# it's a linked class - set up to do the klass.find
linkprop = designator_propname[classname] # eg, msg -> messages
propspec[linkprop][nodeid] = 1
# we override this to ignore not 2 < word < 25 and also to fix a bug -
# the (fail) case.
def find(self, wordlist):
- ''' Locate files that match ALL the words in wordlist
+ '''Locate files that match ALL the words in wordlist
'''
if not hasattr(self, 'words'):
self.load_index()
if hits is None:
hits = {}
for k in entry.keys():
+ if not self.fileids.has_key(k):
+ raise ValueError, 'Index is corrupted: re-generate it'
hits[k] = self.fileids[k]
else:
# Eliminate hits for every non-match
self.changed = 0
def purge_entry(self, identifier):
- ''' Remove a file from file index and word index
+ '''Remove a file from file index and word index
'''
+ self.load_index()
+
if not self.files.has_key(identifier):
return
return (hasattr(self,'fileids') and hasattr(self,'files') and
hasattr(self,'words'))
-#
-#$Log: not supported by cvs2svn $
-#Revision 1.10 2002/07/14 23:17:24 richard
-#oops
-#
-#Revision 1.9 2002/07/14 06:11:16 richard
-#Some TODOs
-#
-#Revision 1.8 2002/07/09 21:53:38 gmcm
-#Optimize Class.find so that the propspec can contain a set of ids to match.
-#This is used by indexer.search so it can do just one find for all the index matches.
-#This was already confusing code, but for common terms (lots of index matches),
-#it is enormously faster.
-#
-#Revision 1.7 2002/07/09 21:38:43 richard
-#Only save the index if the thing is loaded and changed. Also, don't load
-#the index just for a save.
-#
-#Revision 1.6 2002/07/09 04:26:44 richard
-#We're indexing numbers now, and _underscore words
-#
-#Revision 1.5 2002/07/09 04:19:09 richard
-#Added reindex command to roundup-admin.
-#Fixed reindex on first access.
-#Also fixed reindexing of entries that change.
-#
-#Revision 1.4 2002/07/09 03:02:52 richard
-#More indexer work:
-#- all String properties may now be indexed too. Currently there's a bit of
-# "issue" specific code in the actual searching which needs to be
-# addressed. In a nutshell:
-# + pass 'indexme="yes"' as a String() property initialisation arg, eg:
-# file = FileClass(db, "file", name=String(), type=String(),
-# comment=String(indexme="yes"))
-# + the comment will then be indexed and be searchable, with the results
-# related back to the issue that the file is linked to
-#- as a result of this work, the FileClass has a default MIME type that may
-# be overridden in a subclass, or by the use of a "type" property as is
-# done in the default templates.
-#- the regeneration of the indexes (if necessary) is done once the schema is
-# set up in the dbinit.
-#
-#Revision 1.3 2002/07/08 06:58:15 richard
-#cleaned up the indexer code:
-# - it splits more words out (much simpler, faster splitter)
-# - removed code we'll never use (roundup.roundup_indexer has the full
-# implementation, and replaces roundup.indexer)
-# - only index text/plain and rfc822/message (ideas for other text formats to
-# index are welcome)
-# - added simple unit test for indexer. Needs more tests for regression.
-#
-#Revision 1.2 2002/05/25 07:16:24 rochecompaan
-#Merged search_indexing-branch with HEAD
-#
-#Revision 1.1.2.3 2002/05/02 11:52:12 rochecompaan
-#Fixed small bug that prevented indexes from being generated.
-#
-#Revision 1.1.2.2 2002/04/19 19:54:42 rochecompaan
-#cgi_client.py
-# removed search link for the time being
-# moved rendering of matches to htmltemplate
-#hyperdb.py
-# filtering of nodes on full text search incorporated in filter method
-#roundupdb.py
-# added paramater to call of filter method
-#roundup_indexer.py
-# added search method to RoundupIndexer class
-#
-#Revision 1.1.2.1 2002/04/03 11:55:57 rochecompaan
-# . Added feature #526730 - search for messages capability
-#
+# vim: set filetype=python ts=4 sw=4 et si :