X-Git-Url: https://git.tokkee.org/?a=blobdiff_plain;f=roundup%2Findexer.py;h=4295b70167573e4b76e749abe51a466b1496be12;hb=efc777b7a124948b50aa5fa521cba8bde4486fc1;hp=d09e950d8102a1b6843ea8150d8068f8b491a7ba;hpb=d195729acba7813a8e314a76d3a6ba73f6dc9f1f;p=roundup.git diff --git a/roundup/indexer.py b/roundup/indexer.py index d09e950..4295b70 100644 --- a/roundup/indexer.py +++ b/roundup/indexer.py @@ -14,13 +14,14 @@ # that promote freedom, but obviously am giving up any rights # to compel such. # -#$Id: indexer.py,v 1.6 2002-07-09 04:26:44 richard Exp $ +#$Id: indexer.py,v 1.13 2002-09-10 00:18:20 richard Exp $ ''' This module provides an indexer class, RoundupIndexer, that stores text indices in a roundup instance. This class makes searching the content of -messages and text files possible. +messages, string properties and text files possible. ''' import os, shutil, re, mimetypes, marshal, zlib, errno +from hyperdb import Link, Multilink class Indexer: ''' Indexes information from roundup's hyperdb to allow efficient @@ -30,20 +31,26 @@ class Indexer: files {identifier: (fileid, wordcount)} words {word: {fileid: count}} fileids {fileid: identifier} + where identifier is (classname, nodeid, propertyname) ''' def __init__(self, db_path): self.indexdb_path = os.path.join(db_path, 'indexes') self.indexdb = os.path.join(self.indexdb_path, 'index.db') self.reindex = 0 - self.casesensitive = 0 self.quiet = 9 + self.changed = 0 # see if we need to reindex because of a change in code + version = os.path.join(self.indexdb_path, 'version') if (not os.path.exists(self.indexdb_path) or - not os.path.exists(os.path.join(self.indexdb_path, 'version'))): - # TODO: if the version file exists (in the future) we'll want to - # check the value in it - for now the file itself is a flag + not os.path.exists(version)): + # for now the file itself is a flag self.force_reindex() + elif os.path.exists(version): + version = open(version).read() + # check the value and reindex if it's not the latest + if version.strip() != '1': + self.force_reindex() def force_reindex(self): '''Force a reindex condition @@ -54,6 +61,7 @@ class Indexer: os.chmod(self.indexdb_path, 0775) open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n') self.reindex = 1 + self.changed = 1 def should_reindex(self): '''Should we reindex? @@ -102,21 +110,23 @@ class Indexer: # make a reference to the file for this word entry[file_index] = filedict[word] + # save needed + self.changed = 1 + def splitter(self, text, ftype): ''' Split the contents of a text string into a list of 'words' ''' if ftype == 'text/plain': - words = self.text_splitter(text, self.casesensitive) + words = self.text_splitter(text) else: return [] return words - def text_splitter(self, text, casesensitive=0): + def text_splitter(self, text): """Split text/plain string into a list of words """ - # Let's adjust case if not case-sensitive - if not casesensitive: - text = text.upper() + # case insensitive + text = text.upper() # Split the raw text, losing anything longer than 25 characters # since that'll be gibberish (encoded text or somesuch) or shorter @@ -136,12 +146,18 @@ class Indexer: if not hits: return {} - # this is specific to "issue" klass ... eugh - designator_propname = {'msg': 'messages', 'file': 'files'} + #designator_propname = {'msg': 'messages', 'file': 'files'} + designator_propname = {} + for nm, propclass in klass.getprops().items(): + if isinstance(propclass, Link) or isinstance(propclass, Multilink): + designator_propname[propclass.classname] = nm # build a dictionary of nodes and their associated messages # and files - nodeids = {} + nodeids = {} # this is the answer + propspec = {} # used to do the klass.find + for propname in designator_propname.values(): + propspec[propname] = {} # used as a set (value doesn't matter) for classname, nodeid, property in hits.values(): # skip this result if we don't care about this class/property if ignore.has_key((classname, property)): @@ -153,20 +169,30 @@ class Indexer: nodeids[nodeid] = {} continue - # it's a linked class - find the klass entries that are - # linked to it - linkprop = designator_propname[classname] - for resid in klass.find(**{linkprop: nodeid}): - resid = str(resid) - if not nodeids.has_key(id): - nodeids[resid] = {} - - # update the links for this klass nodeid - node_dict = nodeids[resid] - if not node_dict.has_key(linkprop): - node_dict[linkprop] = [nodeid] - elif node_dict.has_key(linkprop): - node_dict[linkprop].append(nodeid) + # it's a linked class - set up to do the klass.find + linkprop = designator_propname[classname] # eg, msg -> messages + propspec[linkprop][nodeid] = 1 + + # retain only the meaningful entries + for propname, idset in propspec.items(): + if not idset: + del propspec[propname] + + # klass.find tells me the klass nodeids the linked nodes relate to + for resid in klass.find(**propspec): + resid = str(resid) + if not nodeids.has_key(id): + nodeids[resid] = {} + node_dict = nodeids[resid] + # now figure out where it came from + for linkprop in propspec.keys(): + for nodeid in klass.get(resid, linkprop): + if propspec[linkprop].has_key(nodeid): + # OK, this node[propname] has a winner + if not node_dict.has_key(linkprop): + node_dict[linkprop] = [nodeid] + else: + node_dict[linkprop].append(nodeid) return nodeids # we override this to ignore not 2 < word < 25 and also to fix a bug - @@ -183,8 +209,7 @@ class Indexer: if not 2 < len(word) < 25: # word outside the bounds of what we index - ignore continue - if not self.casesensitive: - word = word.upper() + word = word.upper() entry = self.words.get(word) # For each word, get index entries[word] = entry # of matching files if not entry: # Nothing for this one word (fail) @@ -224,8 +249,8 @@ class Indexer: try: f = open(self.indexdb + segment, 'rb') except IOError, error: - if error.errno != errno.ENOENT: - raise + # probably just nonexistent segment index file + if error.errno != errno.ENOENT: raise else: pickle_str = zlib.decompress(f.read()) f.close() @@ -244,19 +269,20 @@ class Indexer: self.words = db['WORDS'] self.files = db['FILES'] self.fileids = db['FILEIDS'] + self.changed = 0 def save_index(self): - # make sure we're loaded - self.load_index() + # only save if the index is loaded and changed + if not self.index_loaded() or not self.changed: + return # brutal space saver... delete all the small segments for segment in self.segments: try: os.remove(self.indexdb + segment) - except OSError: + except OSError, error: # probably just nonexistent segment index file - # TODO: make sure it's an EEXIST - pass + if error.errno != errno.ENOENT: raise # First write the much simpler filename/fileid dictionaries dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids} @@ -280,9 +306,14 @@ class Indexer: pickle_fh.write(zlib.compress(pickle_str)) os.chmod(filename, 0664) + # save done + self.changed = 0 + def purge_entry(self, identifier): ''' Remove a file from file index and word index ''' + self.load_index() + if not self.files.has_key(identifier): return @@ -295,59 +326,11 @@ class Indexer: if occurs.has_key(file_index): del occurs[file_index] + # save needed + self.changed = 1 + def index_loaded(self): return (hasattr(self,'fileids') and hasattr(self,'files') and hasattr(self,'words')) -# -#$Log: not supported by cvs2svn $ -#Revision 1.5 2002/07/09 04:19:09 richard -#Added reindex command to roundup-admin. -#Fixed reindex on first access. -#Also fixed reindexing of entries that change. -# -#Revision 1.4 2002/07/09 03:02:52 richard -#More indexer work: -#- all String properties may now be indexed too. Currently there's a bit of -# "issue" specific code in the actual searching which needs to be -# addressed. In a nutshell: -# + pass 'indexme="yes"' as a String() property initialisation arg, eg: -# file = FileClass(db, "file", name=String(), type=String(), -# comment=String(indexme="yes")) -# + the comment will then be indexed and be searchable, with the results -# related back to the issue that the file is linked to -#- as a result of this work, the FileClass has a default MIME type that may -# be overridden in a subclass, or by the use of a "type" property as is -# done in the default templates. -#- the regeneration of the indexes (if necessary) is done once the schema is -# set up in the dbinit. -# -#Revision 1.3 2002/07/08 06:58:15 richard -#cleaned up the indexer code: -# - it splits more words out (much simpler, faster splitter) -# - removed code we'll never use (roundup.roundup_indexer has the full -# implementation, and replaces roundup.indexer) -# - only index text/plain and rfc822/message (ideas for other text formats to -# index are welcome) -# - added simple unit test for indexer. Needs more tests for regression. -# -#Revision 1.2 2002/05/25 07:16:24 rochecompaan -#Merged search_indexing-branch with HEAD -# -#Revision 1.1.2.3 2002/05/02 11:52:12 rochecompaan -#Fixed small bug that prevented indexes from being generated. -# -#Revision 1.1.2.2 2002/04/19 19:54:42 rochecompaan -#cgi_client.py -# removed search link for the time being -# moved rendering of matches to htmltemplate -#hyperdb.py -# filtering of nodes on full text search incorporated in filter method -#roundupdb.py -# added paramater to call of filter method -#roundup_indexer.py -# added search method to RoundupIndexer class -# -#Revision 1.1.2.1 2002/04/03 11:55:57 rochecompaan -# . Added feature #526730 - search for messages capability -# +# vim: set filetype=python ts=4 sw=4 et si