diff --git a/roundup/indexer.py b/roundup/indexer.py
index 8b2f615..e4d1d59 100644
--- a/roundup/indexer.py
+++ b/roundup/indexer.py
@@ -14,134 +14,73 @@
 # that promote freedom, but obviously am giving up any rights
 # to compel such.
 #
-#$Id: indexer.py,v 1.3 2002-07-08 06:58:15 richard Exp $
+#$Id: indexer.py,v 1.14 2002-09-25 05:06:14 richard Exp $
 '''
 This module provides an indexer class, RoundupIndexer, that stores text
 indices in a roundup instance.  This class makes searching the content of
-messages and text files possible.
+messages, string properties and text files possible.
 '''
 import os, shutil, re, mimetypes, marshal, zlib, errno
+from hyperdb import Link, Multilink
 
 class Indexer:
-    ''' Indexes messages and files.
-
-    This implements a new splitter based on re.findall '\w+' and the
-    add_othertext method.
+    ''' Indexes information from roundup's hyperdb to allow efficient
+        searching.
+
+        Three structures are created by the indexer:
+          files {identifier: (fileid, wordcount)}
+          words {word: {fileid: count}}
+          fileids {fileid: identifier}
+        where identifier is (classname, nodeid, propertyname)
     '''
     def __init__(self, db_path):
-        indexdb_path = os.path.join(db_path, 'indexes')
-
-        # see if we need to reindex because of a change in code
-        if (os.path.exists(indexdb_path) and
-                not os.path.exists(os.path.join(indexdb_path, 'version'))):
-            shutil.rmtree(indexdb_path)
-
-        # see if the index exists
-        index_exists = 0
-        if not os.path.exists(indexdb_path):
-            os.makedirs(indexdb_path)
-            os.chmod(indexdb_path, 0775)
-            open(os.path.join(indexdb_path, 'version'), 'w').write('1\n')
-        else:
-            index_exists = 1
-
-        # save off the path to the indexdb
-        self.indexdb = os.path.join(indexdb_path, 'index.db')
+        self.indexdb_path = os.path.join(db_path, 'indexes')
+        self.indexdb = os.path.join(self.indexdb_path, 'index.db')
         self.reindex = 0
-        self.casesensitive = 0
         self.quiet = 9
+        self.changed = 0
 
-        if not index_exists:
-            # index everything
-            files_path = os.path.join(db_path, 'files')
-            self.add_files(dir=files_path)
-            self.save_index()
-
-    # override add_files so it's a little smarter about file types
-    def add_files(self, dir):
-        if not hasattr(self, 'files'):
-            self.load_index()
-        os.path.walk(dir, self.walk_add_file, None)
-        # Rebuild the fileid index
-        self.fileids = {}
-        for fname in self.files.keys():
-            fileid = self.files[fname][0]
-            self.fileids[fileid] = fname
-
-    # override add_file so it can be a little smarter about determining the
-    # file type
-    def walk_add_file(self, arg, dname, names, ftype=None):
-        for name in names:
-            name = os.path.join(dname, name)
-            if os.path.isfile(name):
-                self.add_file(name)
-            elif os.path.isdir(name):
-                os.path.walk(name, self.walk_add_file, None)
-    def add_file(self, fname, ftype=None):
-        ''' Index the contents of a regular file
+        # see if we need to reindex because of a change in code
+        version = os.path.join(self.indexdb_path, 'version')
+        if (not os.path.exists(self.indexdb_path) or
+                not os.path.exists(version)):
+            # for now the file itself is a flag
+            self.force_reindex()
+        elif os.path.exists(version):
+            version = open(version).read()
+            # check the value and reindex if it's not the latest
+            if version.strip() != '1':
+                self.force_reindex()
+
+    def force_reindex(self):
+        '''Force a reindex condition
+        '''
-        if not hasattr(self, 'files'):
-            self.load_index()
-        # Is file eligible for (re)indexing?
-        if self.files.has_key(fname):
-            if self.reindex:
-                # Reindexing enabled, cleanup dicts
-                self.purge_entry(fname, self.files, self.words)
-            else:
-                # DO NOT reindex this file
-                if self.quiet < 5:
-                    print "Skipping", fname
-                return 0
-
-        # guess the file type
-        if ftype is None:
-            ftype = mimetypes.guess_type(fname)
-
-        # read in the file
-        text = open(fname).read()
-        if self.quiet < 5: print "Indexing", fname
-        words = self.splitter(text, ftype)
-
-        # Find new file index, and assign it to filename
-        # (_TOP uses trick of negative to avoid conflict with file index)
-        self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
-        file_index = abs(self.files['_TOP'][0])
-        self.files[fname] = (file_index, len(words))
-
-        filedict = {}
-        for word in words:
-            if filedict.has_key(word):
-                filedict[word] = filedict[word]+1
-            else:
-                filedict[word] = 1
-
-        for word in filedict.keys():
-            if self.words.has_key(word):
-                entry = self.words[word]
-            else:
-                entry = {}
-            entry[file_index] = filedict[word]
-            self.words[word] = entry
+        if os.path.exists(self.indexdb_path):
+            shutil.rmtree(self.indexdb_path)
+        os.makedirs(self.indexdb_path)
+        os.chmod(self.indexdb_path, 0775)
+        open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
+        self.reindex = 1
+        self.changed = 1
+
+    def should_reindex(self):
+        '''Should we reindex?
+        '''
+        return self.reindex
 
-    # NOTE: this method signature deviates from the one specified in
-    # indexer - I'm not entirely sure where it was expected to the text
-    # from otherwise...
-    def add_othertext(self, identifier, text):
-        ''' Add some text associated with the identifier
+    def add_text(self, identifier, text, mime_type='text/plain'):
+        ''' Add some text associated with the (classname, nodeid, property)
+            identifier.
         '''
-        # Is file eligible for (re)indexing?
+ # make sure the index is loaded + self.load_index() + + # remove old entries for this identifier if self.files.has_key(identifier): - # Reindexing enabled, cleanup dicts - if self.reindex: - self.purge_entry(identifier, self.files, self.words) - else: - # DO NOT reindex this file - if self.quiet < 5: - print "Not reindexing", identifier - return 0 + self.purge_entry(identifier) # split into words - words = self.splitter(text, 'text/plain') + words = self.splitter(text, mime_type) # Find new file index, and assign it to identifier # (_TOP uses trick of negative to avoid conflict with file index) @@ -171,21 +110,23 @@ class Indexer: # make a reference to the file for this word entry[file_index] = filedict[word] + # save needed + self.changed = 1 + def splitter(self, text, ftype): ''' Split the contents of a text string into a list of 'words' ''' - if ftype in ('text/plain', 'message/rfc822'): - words = self.text_splitter(text, self.casesensitive) + if ftype == 'text/plain': + words = self.text_splitter(text) else: return [] return words - def text_splitter(self, text, casesensitive=0): + def text_splitter(self, text): """Split text/plain string into a list of words """ - # Let's adjust case if not case-sensitive - if not casesensitive: - text = text.upper() + # case insensitive + text = text.upper() # Split the raw text, losing anything longer than 25 characters # since that'll be gibberish (encoded text or somesuch) or shorter @@ -193,37 +134,69 @@ class Indexer: # place return re.findall(r'\b\w{2,25}\b', text) - def search(self, search_terms, klass): - ''' display search results + def search(self, search_terms, klass, ignore={}, + dre=re.compile(r'([^\d]+)(\d+)')): + ''' Display search results looking for [search, terms] associated + with the hyperdb Class "klass". Ignore hits on {class: property}. + + "dre" is a helper, not an argument. 
         '''
+        # do the index lookup
         hits = self.find(search_terms)
-        links = []
-        nodeids = {}
-        designator_propname = {'msg': 'messages', 'file': 'files'}
-        if hits:
-            hitcount = len(hits)
-            # build a dictionary of nodes and their associated messages
-            # and files
-            for hit in hits.keys():
-                filename = hits[hit].split('/')[-1]
-                for designator, propname in designator_propname.items():
-                    if not filename.startswith(designator):
-                        continue
-                    nodeid = filename[len(designator):]
-                    result = apply(klass.find, (), {propname:nodeid})
-                    if not result:
-                        continue
-
-                    id = str(result[0])
-                    if not nodeids.has_key(id):
-                        nodeids[id] = {}
-
-                    node_dict = nodeids[id]
-                    if not node_dict.has_key(propname):
-                        node_dict[propname] = [nodeid]
-                    elif node_dict.has_key(propname):
-                        node_dict[propname].append(nodeid)
+        if not hits:
+            return {}
+
+        #designator_propname = {'msg': 'messages', 'file': 'files'}
+        designator_propname = {}
+        for nm, propclass in klass.getprops().items():
+            if isinstance(propclass, Link) or isinstance(propclass, Multilink):
+                designator_propname[propclass.classname] = nm
+
+        # build a dictionary of nodes and their associated messages
+        # and files
+        nodeids = {} # this is the answer
+        propspec = {} # used to do the klass.find
+        for propname in designator_propname.values():
+            propspec[propname] = {} # used as a set (value doesn't matter)
+        for classname, nodeid, property in hits.values():
+            # skip this result if we don't care about this class/property
+            if ignore.has_key((classname, property)):
+                continue
+
+            # if it's a property on klass, it's easy
+            if classname == klass.classname:
+                if not nodeids.has_key(nodeid):
+                    nodeids[nodeid] = {}
+                continue
+
+            # make sure the class is a linked one, otherwise ignore
+            if not designator_propname.has_key(classname):
+                continue
+            # it's a linked class - set up to do the klass.find
+            linkprop = designator_propname[classname] # eg, msg -> messages
+            propspec[linkprop][nodeid] = 1
+
+        # retain only the meaningful entries
+        for propname, idset in propspec.items():
+            if not idset:
+                del propspec[propname]
+
+        # klass.find tells me the klass nodeids the linked nodes relate to
+        for resid in klass.find(**propspec):
+            resid = str(resid)
+            if not nodeids.has_key(resid):
+                nodeids[resid] = {}
+            node_dict = nodeids[resid]
+            # now figure out where it came from
+            for linkprop in propspec.keys():
+                for nodeid in klass.get(resid, linkprop):
+                    if propspec[linkprop].has_key(nodeid):
+                        # OK, this node[propname] has a winner
+                        if not node_dict.has_key(linkprop):
+                            node_dict[linkprop] = [nodeid]
+                        else:
+                            node_dict[linkprop].append(nodeid)
         return nodeids
 
     # we override this to ignore not 2 < word < 25 and also to fix a bug -
@@ -240,8 +213,7 @@ class Indexer:
             if not 2 < len(word) < 25:
                 # word outside the bounds of what we index - ignore
                 continue
-            if not self.casesensitive:
-                word = word.upper()
+            word = word.upper()
             entry = self.words.get(word)    # For each word, get index
             entries[word] = entry           #   of matching files
             if not entry:                   # Nothing for this one word (fail)
@@ -259,7 +231,7 @@
             return {}
         return hits
 
-    segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!"
+    segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
def load_index(self, reload=0, wordlist=None): # Unless reload is indicated, do not load twice if self.index_loaded() and not reload: @@ -281,8 +253,8 @@ class Indexer: try: f = open(self.indexdb + segment, 'rb') except IOError, error: - if error.errno != errno.ENOENT: - raise + # probably just nonexistent segment index file + if error.errno != errno.ENOENT: raise else: pickle_str = zlib.decompress(f.read()) f.close() @@ -301,23 +273,27 @@ class Indexer: self.words = db['WORDS'] self.files = db['FILES'] self.fileids = db['FILEIDS'] + self.changed = 0 def save_index(self): + # only save if the index is loaded and changed + if not self.index_loaded() or not self.changed: + return + # brutal space saver... delete all the small segments for segment in self.segments: try: os.remove(self.indexdb + segment) - except OSError: + except OSError, error: # probably just nonexistent segment index file - # TODO: make sure it's an EEXIST - pass + if error.errno != errno.ENOENT: raise # First write the much simpler filename/fileid dictionaries dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids} open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil))) # The hard part is splitting the word dictionary up, of course - letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#" + letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_" segdicts = {} # Need batch of empty dicts for segment in letters: segdicts[segment] = {} @@ -334,43 +310,31 @@ class Indexer: pickle_fh.write(zlib.compress(pickle_str)) os.chmod(filename, 0664) - def purge_entry(self, fname, file_dct, word_dct): + # save done + self.changed = 0 + + def purge_entry(self, identifier): ''' Remove a file from file index and word index ''' - try: # The easy part, cleanup the file index - file_index = file_dct[fname] - del file_dct[fname] - except KeyError: - pass # We'll assume we only encounter KeyError's + self.load_index() + + if not self.files.has_key(identifier): + return + + file_index = self.files[identifier][0] + del self.files[identifier] + del self.fileids[file_index] + # The much harder part, cleanup the word index - for word, occurs in word_dct.items(): + for key, occurs in self.words.items(): if occurs.has_key(file_index): del occurs[file_index] - word_dct[word] = occurs + + # save needed + self.changed = 1 def index_loaded(self): return (hasattr(self,'fileids') and hasattr(self,'files') and hasattr(self,'words')) -# -#$Log: not supported by cvs2svn $ -#Revision 1.2 2002/05/25 07:16:24 rochecompaan -#Merged search_indexing-branch with HEAD -# -#Revision 1.1.2.3 2002/05/02 11:52:12 rochecompaan -#Fixed small bug that prevented indexes from being generated. -# -#Revision 1.1.2.2 2002/04/19 19:54:42 rochecompaan -#cgi_client.py -# removed search link for the time being -# moved rendering of matches to htmltemplate -#hyperdb.py -# filtering of nodes on full text search incorporated in filter method -#roundupdb.py -# added paramater to call of filter method -#roundup_indexer.py -# added search method to RoundupIndexer class -# -#Revision 1.1.2.1 2002/04/03 11:55:57 rochecompaan -# . Added feature #526730 - search for messages capability -# +# vim: set filetype=python ts=4 sw=4 et si
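
Notes on the patch follow; the sketches below are illustrative stand-ins
written in modern Python, not roundup code.

The 'version' file now does double duty: its absence means there is no usable
index on disk (pre-versioning indexes included), and a stale value means the
on-disk format changed; either way __init__ calls force_reindex() and the
whole index directory is rebuilt. A minimal sketch of that check, assuming
only the stdlib (the helper name is invented):

    import os

    def needs_reindex(indexdb_path, current='1'):
        # the version file is both an "index exists" flag and a format
        # stamp; any mismatch forces a full rebuild
        version = os.path.join(indexdb_path, 'version')
        if not os.path.exists(indexdb_path) or not os.path.exists(version):
            return True
        with open(version) as f:
            return f.read().strip() != current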
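
The three structures in the class docstring are the heart of the change:
files maps a (classname, nodeid, propertyname) identifier to a (fileid,
wordcount) pair, words maps each indexed word to {fileid: count}, and
fileids maps a fileid back to its identifier. A minimal in-memory sketch of
how add_text, purge_entry and find cooperate on those dicts (the class name
is invented, and the real Indexer also persists the dicts and batches saves
via self.changed):

    import re

    class SketchIndexer:
        '''In-memory stand-in for the patched Indexer; no persistence.'''
        def __init__(self):
            self.files = {'_TOP': (0, None)}  # identifier -> (fileid, wordcount)
            self.words = {}                   # WORD -> {fileid: count}
            self.fileids = {}                 # fileid -> identifier

        def splitter(self, text):
            # upper-case and keep 2-25 character words, as in text_splitter()
            return re.findall(r'\b\w{2,25}\b', text.upper())

        def add_text(self, identifier, text):
            if identifier in self.files:
                self.purge_entry(identifier)
            words = self.splitter(text)
            # the _TOP entry counts downward; abs() yields the next fileid
            self.files['_TOP'] = (self.files['_TOP'][0] - 1, None)
            fileid = abs(self.files['_TOP'][0])
            self.files[identifier] = (fileid, len(words))
            self.fileids[fileid] = identifier
            for word in words:
                entry = self.words.setdefault(word, {})
                entry[fileid] = entry.get(fileid, 0) + 1

        def purge_entry(self, identifier):
            fileid = self.files.pop(identifier)[0]
            del self.fileids[fileid]
            for occurs in self.words.values():
                occurs.pop(fileid, None)

        def find(self, wordlist):
            # intersect per-word posting dicts; any word with no entry fails
            hits = None
            for word in wordlist:
                entry = self.words.get(word.upper())
                if not entry:
                    return {}
                hits = set(entry) if hits is None else hits & set(entry)
            return dict((fid, self.fileids[fid]) for fid in hits or ())

For example:

    idx = SketchIndexer()
    idx.add_text(('msg', '1', 'content'), 'the quick brown fox')
    idx.add_text(('file', '2', 'content'), 'a quick word list')
    print(idx.find(['quick']))         # hits both identifiers
    print(idx.find(['quick', 'fox']))  # only ('msg', '1', 'content')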
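
search() then has to turn raw word hits back into nodes of the class being
searched: a hit directly on klass keeps its own nodeid, while a hit on a
linked class such as msg is routed through whichever Link or Multilink
property points at it (discovered via klass.getprops()) and resolved with
klass.find(). A sketch of that resolution step with hyperdb stubbed out
(IssueStub, its data and the single 'messages' property are invented for
illustration):

    class IssueStub:
        '''Stands in for a hyperdb Class with one Multilink, "messages".'''
        classname = 'issue'
        links = {'7': ['1', '3'], '8': ['2']}  # issue id -> linked msg ids

        def find(self, **propspec):
            wanted = propspec.get('messages', {})
            return [i for i, msgs in self.links.items()
                    if any(m in wanted for m in msgs)]

        def get(self, nodeid, propname):
            return self.links[nodeid]

    def resolve(hits, klass):
        # hits: fileid -> (classname, nodeid, propname), as find() returns
        nodeids, propspec = {}, {'messages': {}}
        for classname, nodeid, prop in hits.values():
            if classname == klass.classname:
                nodeids.setdefault(nodeid, {})    # direct hit on the class
            elif classname == 'msg':
                propspec['messages'][nodeid] = 1  # resolve through the link
        for resid in klass.find(**propspec):
            node_dict = nodeids.setdefault(resid, {})
            node_dict['messages'] = [m for m in klass.get(resid, 'messages')
                                     if m in propspec['messages']]
        return nodeids

    hits = {1: ('msg', '1', 'content'), 2: ('issue', '8', 'title')}
    print(resolve(hits, IssueStub()))
    # -> {'8': {}, '7': {'messages': ['1']}}

Note that the probe on nodeids has to use resid itself; probing with
anything else would clobber entries already recorded for direct hits.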
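
save_index()'s space saver also deserves a note: rather than one big pickle,
the words dictionary is split into per-segment dictionaries keyed by a
word's first character (digits, A-Z, '#', '_'), each marshal-dumped and
zlib-compressed into its own file; the '-' segment holds the files/fileids
maps, and load_index(wordlist=...) can then limit itself to the segments the
search words actually touch. A rough sketch of the on-disk scheme (marshal
and zlib as in the patch; the file layout, helper names and the '#'
catch-all are illustrative):

    import marshal, zlib

    LETTERS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"

    def save_words(indexdb, words):
        # one small file per leading character, so a lookup decompresses
        # only the segments its search words fall into
        segdicts = dict((seg, {}) for seg in LETTERS)
        for word, occurs in words.items():
            seg = word[0] if word[0] in LETTERS else '#'
            segdicts[seg][word] = occurs
        for seg, segdict in segdicts.items():
            with open(indexdb + seg, 'wb') as f:
                f.write(zlib.compress(marshal.dumps(segdict)))

    def load_segment(indexdb, seg):
        try:
            with open(indexdb + seg, 'rb') as f:
                return marshal.loads(zlib.decompress(f.read()))
        except IOError:
            return {}  # a missing segment file just means no words there

    words = {'QUICK': {1: 2}, 'FOX': {1: 1}, '2FA': {2: 1}}
    save_words('/tmp/index.db', words)         # assumes a Unix-style /tmp
    print(load_segment('/tmp/index.db', 'Q'))  # {'QUICK': {1: 2}}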