X-Git-Url: https://git.tokkee.org/?a=blobdiff_plain;ds=sidebyside;f=roundup%2Findexer.py;h=814425a5848798585704549ad35f965be45e19c8;hb=c962438200659241361343528c1f31130fa0e7ca;hp=096b6c6dda28b589db9d29c62a2b510269f6aaf0;hpb=3ee3a04f9c1a4d3a911e633347c72d61be685019;p=roundup.git

diff --git a/roundup/indexer.py b/roundup/indexer.py
index 096b6c6..814425a 100644
--- a/roundup/indexer.py
+++ b/roundup/indexer.py
@@ -14,13 +14,14 @@
 # that promote freedom, but obviously am giving up any rights
 # to compel such.
 #
-#$Id: indexer.py,v 1.5 2002-07-09 04:19:09 richard Exp $
+#$Id: indexer.py,v 1.10 2002-07-14 23:17:24 richard Exp $
 '''
 This module provides an indexer class, RoundupIndexer, that stores text
 indices in a roundup instance.  This class makes searching the content of
-messages and text files possible.
+messages, string properties and text files possible.
 '''
 import os, shutil, re, mimetypes, marshal, zlib, errno
+from hyperdb import Link, Multilink
 
 class Indexer:
     ''' Indexes information from roundup's hyperdb to allow efficient
@@ -30,20 +31,26 @@ class Indexer:
           files   {identifier: (fileid, wordcount)}
           words   {word: {fileid: count}}
          fileids {fileid: identifier}
+        where identifier is (classname, nodeid, propertyname)
     '''
     def __init__(self, db_path):
         self.indexdb_path = os.path.join(db_path, 'indexes')
         self.indexdb = os.path.join(self.indexdb_path, 'index.db')
         self.reindex = 0
-        self.casesensitive = 0
         self.quiet = 9
+        self.changed = 0
 
         # see if we need to reindex because of a change in code
+        version = os.path.join(self.indexdb_path, 'version')
         if (not os.path.exists(self.indexdb_path) or
-                not os.path.exists(os.path.join(self.indexdb_path, 'version'))):
-            # TODO: if the version file exists (in the future) we'll want to
-            # check the value in it - for now the file itself is a flag
+                not os.path.exists(version)):
+            # for now the file itself is a flag
             self.force_reindex()
+        elif os.path.exists(version):
+            version = open(version).read()
+            # check the value and reindex if it's not the latest
+            if version.strip() != '1':
+                self.force_reindex()
 
     def force_reindex(self):
         '''Force a reindex condition
@@ -54,6 +61,7 @@ class Indexer:
         os.chmod(self.indexdb_path, 0775)
         open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
         self.reindex = 1
+        self.changed = 1
 
     def should_reindex(self):
         '''Should we reindex?
@@ -102,21 +110,23 @@ class Indexer:
             # make a reference to the file for this word
             entry[file_index] = filedict[word]
 
+        # save needed
+        self.changed = 1
+
     def splitter(self, text, ftype):
         ''' Split the contents of a text string into a list of 'words'
         '''
         if ftype == 'text/plain':
-            words = self.text_splitter(text, self.casesensitive)
+            words = self.text_splitter(text)
         else:
             return []
         return words
 
-    def text_splitter(self, text, casesensitive=0):
+    def text_splitter(self, text):
         """Split text/plain string into a list of words
         """
-        # Let's adjust case if not case-sensitive
-        if not casesensitive:
-            text = text.upper()
+        # case insensitive
+        text = text.upper()
 
         # Split the raw text, losing anything longer than 25 characters
         # since that'll be gibberish (encoded text or somesuch) or shorter
@@ -136,12 +146,18 @@ class Indexer:
         if not hits:
             return {}
 
-        # this is specific to "issue" klass ... eugh
-        designator_propname = {'msg': 'messages', 'file': 'files'}
+        #designator_propname = {'msg': 'messages', 'file': 'files'}
+        designator_propname = {}
+        for nm, propclass in klass.getprops().items():
+            if isinstance(propclass, Link) or isinstance(propclass, Multilink):
+                designator_propname[propclass.classname] = nm
 
         # build a dictionary of nodes and their associated messages
         # and files
-        nodeids = {}
+        nodeids = {}    # this is the answer
+        propspec = {}   # used to do the klass.find
+        for propname in designator_propname.values():
+            propspec[propname] = {} # used as a set (value doesn't matter)
         for classname, nodeid, property in hits.values():
             # skip this result if we don't care about this class/property
             if ignore.has_key((classname, property)):
                 continue
@@ -153,20 +169,30 @@ class Indexer:
             # if it's a property on klass, it's easy
             if classname == klass.classname:
                 if not nodeids.has_key(nodeid):
                     nodeids[nodeid] = {}
                 continue
 
-            # it's a linked class - find the klass entries that are
-            # linked to it
-            linkprop = designator_propname[classname]
-            for resid in klass.find(**{linkprop: nodeid}):
-                resid = str(resid)
-                if not nodeids.has_key(id):
-                    nodeids[resid] = {}
-
-                # update the links for this klass nodeid
-                node_dict = nodeids[resid]
-                if not node_dict.has_key(linkprop):
-                    node_dict[linkprop] = [nodeid]
-                elif node_dict.has_key(linkprop):
-                    node_dict[linkprop].append(nodeid)
+            # it's a linked class - set up to do the klass.find
+            linkprop = designator_propname[classname]   # eg, msg -> messages
+            propspec[linkprop][nodeid] = 1
+
+        # retain only the meaningful entries
+        for propname, idset in propspec.items():
+            if not idset:
+                del propspec[propname]
+
+        # klass.find tells me the klass nodeids the linked nodes relate to
+        for resid in klass.find(**propspec):
+            resid = str(resid)
+            if not nodeids.has_key(resid):
+                nodeids[resid] = {}
+            node_dict = nodeids[resid]
+            # now figure out where it came from
+            for linkprop in propspec.keys():
+                for nodeid in klass.get(resid, linkprop):
+                    if propspec[linkprop].has_key(nodeid):
+                        # OK, this node[propname] has a winner
+                        if not node_dict.has_key(linkprop):
+                            node_dict[linkprop] = [nodeid]
+                        else:
+                            node_dict[linkprop].append(nodeid)
         return nodeids
 
     # we override this to ignore not 2 < word < 25 and also to fix a bug -
@@ -183,8 +209,7 @@ class Indexer:
             if not 2 < len(word) < 25:
                 # word outside the bounds of what we index - ignore
                 continue
-            if not self.casesensitive:
-                word = word.upper()
+            word = word.upper()
             entry = self.words.get(word)    # For each word, get index
             entries[word] = entry           #   of matching files
             if not entry:                   # Nothing for this one word (fail)
@@ -202,7 +227,7 @@ class Indexer:
                 return {}
         return hits
 
-    segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!"
+    segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
     def load_index(self, reload=0, wordlist=None):
         # Unless reload is indicated, do not load twice
         if self.index_loaded() and not reload:
@@ -224,8 +249,8 @@ class Indexer:
             try:
                 f = open(self.indexdb + segment, 'rb')
             except IOError, error:
-                if error.errno != errno.ENOENT:
-                    raise
+                # probably just nonexistent segment index file
+                if error.errno != errno.ENOENT: raise
             else:
                 pickle_str = zlib.decompress(f.read())
                 f.close()
@@ -244,26 +269,27 @@ class Indexer:
         self.words = db['WORDS']
         self.files = db['FILES']
         self.fileids = db['FILEIDS']
+        self.changed = 0
 
     def save_index(self):
-        # make sure we're loaded
-        self.load_index()
+        # only save if the index is loaded and changed
+        if not self.index_loaded() or not self.changed:
+            return
 
         # brutal space saver... delete all the small segments
         for segment in self.segments:
             try:
                 os.remove(self.indexdb + segment)
-            except OSError:
+            except OSError, error:
                 # probably just nonexistent segment index file
-                # TODO: make sure it's an EEXIST
-                pass
+                if error.errno != errno.ENOENT: raise
 
         # First write the much simpler filename/fileid dictionaries
         dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
         open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))
 
         # The hard part is splitting the word dictionary up, of course
-        letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#"
+        letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
         segdicts = {}                           # Need batch of empty dicts
         for segment in letters:
             segdicts[segment] = {}
@@ -280,6 +306,9 @@ class Indexer:
             pickle_fh.write(zlib.compress(pickle_str))
             os.chmod(filename, 0664)
 
+        # save done
+        self.changed = 0
+
     def purge_entry(self, identifier):
         ''' Remove a file from file index and word index
         '''
@@ -295,12 +324,36 @@ class Indexer:
             if occurs.has_key(file_index):
                 del occurs[file_index]
 
+        # save needed
+        self.changed = 1
+
     def index_loaded(self):
         return (hasattr(self,'fileids') and hasattr(self,'files')
             and hasattr(self,'words'))
 
 #
 #$Log: not supported by cvs2svn $
+#Revision 1.9 2002/07/14 06:11:16 richard
+#Some TODOs
+#
+#Revision 1.8 2002/07/09 21:53:38 gmcm
+#Optimize Class.find so that the propspec can contain a set of ids to match.
+#This is used by indexer.search so it can do just one find for all the index matches.
+#This was already confusing code, but for common terms (lots of index matches),
+#it is enormously faster.
+#
+#Revision 1.7 2002/07/09 21:38:43 richard
+#Only save the index if the thing is loaded and changed. Also, don't load
+#the index just for a save.
+#
+#Revision 1.6 2002/07/09 04:26:44 richard
+#We're indexing numbers now, and _underscore words
+#
+#Revision 1.5 2002/07/09 04:19:09 richard
+#Added reindex command to roundup-admin.
+#Fixed reindex on first access.
+#Also fixed reindexing of entries that change.
+#
 #Revision 1.4 2002/07/09 03:02:52 richard
 #More indexer work:
 #- all String properties may now be indexed too. Currently there's a bit of
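
Note on the index shape. The class docstring above names three dictionaries
(files, words, fileids) and find_keyword requires every query word to match.
As a reading aid, here is a minimal, self-contained sketch of that inverted
index in modern Python; every name in it is hypothetical, not Roundup API.

    import re

    files = {}    # {identifier: (fileid, wordcount)}
    words = {}    # {word: {fileid: count}}
    fileids = {}  # {fileid: identifier}

    def add_text(identifier, text):
        # mimic text_splitter: uppercase everything, keep 2 < len(word) < 25
        wordlist = [w for w in re.findall(r'\w+', text.upper())
                    if 2 < len(w) < 25]
        fileid = len(fileids)
        files[identifier] = (fileid, len(wordlist))
        fileids[fileid] = identifier
        for w in wordlist:
            words.setdefault(w, {})
            words[w][fileid] = words[w].get(fileid, 0) + 1

    def search(wordlist):
        # every word must hit, as in find_keyword's intersection loop
        hits = None
        for w in wordlist:
            ids = set(words.get(w.upper(), {}))
            hits = ids if hits is None else hits & ids
            if not hits:
                return set()
        return set(fileids[fid] for fid in hits)

    add_text(('msg', '1', 'content'), 'the indexer stores word counts')
    add_text(('file', '2', 'content'), 'word counts for every file')
    print(search(['word', 'counts']))  # both identifiers match
    print(search(['indexer']))         # only the msg identifier

The per-word dictionaries make the intersection cheap: a missing word fails
the whole query immediately, which is why find_keyword bails out early.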
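Note on the batched klass.find (revision 1.8). The largest hunk above stops
issuing one klass.find() call per index hit; hits on linked classes are
pooled into propspec and resolved with a single find(). A rough illustration
of that flow follows, with a stub class standing in for the real hyperdb
Class (which is far richer), hits reduced to (classname, nodeid) pairs for
brevity, and OR-across-properties find semantics assumed.

    class StubKlass:
        # issue id -> its Multilink properties, hypothetical sample data
        nodes = {
            '1': {'messages': ['10', '11'], 'files': []},
            '2': {'messages': ['12'], 'files': ['20']},
        }
        def find(self, **propspec):
            # ids of nodes whose Multilink props intersect the given id sets
            return [nid for nid, props in self.nodes.items()
                    if any(set(props[p]) & set(ids)
                           for p, ids in propspec.items())]
        def get(self, nodeid, propname):
            return self.nodes[nodeid][propname]

    klass = StubKlass()
    hits = [('msg', '10'), ('msg', '12'), ('file', '20')]
    designator_propname = {'msg': 'messages', 'file': 'files'}

    # pool all hits into one propspec instead of one find() per hit
    propspec = {'messages': {}, 'files': {}}
    for classname, nodeid in hits:
        propspec[designator_propname[classname]][nodeid] = 1
    propspec = {p: ids for p, ids in propspec.items() if ids}

    nodeids = {}
    for resid in klass.find(**propspec):
        node_dict = nodeids.setdefault(resid, {})
        # figure out which linked nodes produced this hit
        for linkprop, ids in propspec.items():
            matched = [n for n in klass.get(resid, linkprop) if n in ids]
            if matched:
                node_dict.setdefault(linkprop, []).extend(matched)
    print(nodeids)
    # {'1': {'messages': ['10']}, '2': {'messages': ['12'], 'files': ['20']}}

As the 1.8 log entry says, common terms produce many index matches, so
collapsing them into one find() call is enormously faster than the old
one-call-per-hit loop.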
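Note on the segment files. save_index partitions self.words by first
character and writes each partition as a zlib-compressed marshal dump to its
own file (self.indexdb plus the segment character), so a lookup only has to
decompress the segments it touches; the diff widens the segment alphabet to
cover digits and underscores. A small sketch of that round trip, with
illustrative paths and data, not Roundup's actual file layout:

    import marshal, os, tempfile, zlib

    words = {'WORD': {0: 2}, 'ZEBRA': {1: 1}, '2002': {0: 1}}
    letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
    indexdb = os.path.join(tempfile.mkdtemp(), 'index.db')

    # split into per-segment dictionaries keyed by first character
    segdicts = {segment: {} for segment in letters}
    for word, entry in words.items():
        segdicts[word[0]][word] = entry

    # one compressed marshal dump per non-empty segment
    for segment, segdict in segdicts.items():
        if segdict:
            with open(indexdb + segment, 'wb') as f:
                f.write(zlib.compress(marshal.dumps(segdict)))

    # looking up a word only touches its own segment file
    def lookup(word):
        try:
            with open(indexdb + word[0], 'rb') as f:
                segdict = marshal.loads(zlib.decompress(f.read()))
        except IOError:
            # nonexistent segment file: no hits, mirroring the ENOENT check
            return {}
        return segdict.get(word, {})

    print(lookup('ZEBRA'))  # {1: 1}
    print(lookup('QUUX'))   # {} (that segment file was never written)

This also explains the guarded os.remove loop in save_index: stale segment
files must be deleted before the new set is written, and a missing segment
(ENOENT) is the expected, harmless case.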