diff --git a/roundup/indexer.py b/roundup/indexer.py
index 8b2f61562dda3634755f7e1784fc3886c8fb5cdf..4cd5d24ad1b763e5571f1d4bc25233a22715594d 100644 (file)
--- a/roundup/indexer.py
+++ b/roundup/indexer.py
# that promote freedom, but obviously am giving up any rights
# to compel such.
#
-#$Id: indexer.py,v 1.3 2002-07-08 06:58:15 richard Exp $
-'''
-This module provides an indexer class, RoundupIndexer, that stores text
+#$Id: indexer.py,v 1.18 2004-02-11 23:55:08 richard Exp $
+'''This module provides an indexer class, RoundupIndexer, that stores text
indices in a roundup instance. This class makes searching the content of
-messages and text files possible.
+messages, string properties and text files possible.
'''
+__docformat__ = 'restructuredtext'
+
import os, shutil, re, mimetypes, marshal, zlib, errno
+from hyperdb import Link, Multilink
class Indexer:
- ''' Indexes messages and files.
+ '''Indexes information from roundup's hyperdb to allow efficient
+ searching.
- This implements a new splitter based on re.findall '\w+' and the
- add_othertext method.
- '''
- def __init__(self, db_path):
- indexdb_path = os.path.join(db_path, 'indexes')
+ Three structures are created by the indexer::
- # see if we need to reindex because of a change in code
- if (os.path.exists(indexdb_path) and
- not os.path.exists(os.path.join(indexdb_path, 'version'))):
- shutil.rmtree(indexdb_path)
-
- # see if the index exists
- index_exists = 0
- if not os.path.exists(indexdb_path):
- os.makedirs(indexdb_path)
- os.chmod(indexdb_path, 0775)
- open(os.path.join(indexdb_path, 'version'), 'w').write('1\n')
- else:
- index_exists = 1
+ files {identifier: (fileid, wordcount)}
+ words {word: {fileid: count}}
+ fileids {fileid: identifier}
- # save off the path to the indexdb
- self.indexdb = os.path.join(indexdb_path, 'index.db')
+ where identifier is (classname, nodeid, propertyname)
+ '''
+ def __init__(self, db_path):
+ self.indexdb_path = os.path.join(db_path, 'indexes')
+ self.indexdb = os.path.join(self.indexdb_path, 'index.db')
self.reindex = 0
- self.casesensitive = 0
self.quiet = 9
+ self.changed = 0
- if not index_exists:
- # index everything
- files_path = os.path.join(db_path, 'files')
- self.add_files(dir=files_path)
- self.save_index()
-
- # override add_files so it's a little smarter about file types
- def add_files(self, dir):
- if not hasattr(self, 'files'):
- self.load_index()
- os.path.walk(dir, self.walk_add_file, None)
- # Rebuild the fileid index
- self.fileids = {}
- for fname in self.files.keys():
- fileid = self.files[fname][0]
- self.fileids[fileid] = fname
-
- # override add_file so it can be a little smarter about determining the
- # file type
- def walk_add_file(self, arg, dname, names, ftype=None):
- for name in names:
- name = os.path.join(dname, name)
- if os.path.isfile(name):
- self.add_file(name)
- elif os.path.isdir(name):
- os.path.walk(name, self.walk_add_file, None)
- def add_file(self, fname, ftype=None):
- ''' Index the contents of a regular file
+ # see if we need to reindex because of a change in code
+ version = os.path.join(self.indexdb_path, 'version')
+ if (not os.path.exists(self.indexdb_path) or
+ not os.path.exists(version)):
+ # for now the file itself is a flag
+ self.force_reindex()
+ elif os.path.exists(version):
+ version = open(version).read()
+ # check the value and reindex if it's not the latest
+ if version.strip() != '1':
+ self.force_reindex()
+
+ def force_reindex(self):
+ '''Force a reindex condition
'''
- if not hasattr(self, 'files'):
- self.load_index()
- # Is file eligible for (re)indexing?
- if self.files.has_key(fname):
- if self.reindex:
- # Reindexing enabled, cleanup dicts
- self.purge_entry(fname, self.files, self.words)
- else:
- # DO NOT reindex this file
- if self.quiet < 5:
- print "Skipping", fname
- return 0
-
- # guess the file type
- if ftype is None:
- ftype = mimetypes.guess_type(fname)
-
- # read in the file
- text = open(fname).read()
- if self.quiet < 5: print "Indexing", fname
- words = self.splitter(text, ftype)
-
- # Find new file index, and assign it to filename
- # (_TOP uses trick of negative to avoid conflict with file index)
- self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
- file_index = abs(self.files['_TOP'][0])
- self.files[fname] = (file_index, len(words))
-
- filedict = {}
- for word in words:
- if filedict.has_key(word):
- filedict[word] = filedict[word]+1
- else:
- filedict[word] = 1
-
- for word in filedict.keys():
- if self.words.has_key(word):
- entry = self.words[word]
- else:
- entry = {}
- entry[file_index] = filedict[word]
- self.words[word] = entry
+ if os.path.exists(self.indexdb_path):
+ shutil.rmtree(self.indexdb_path)
+ os.makedirs(self.indexdb_path)
+ os.chmod(self.indexdb_path, 0775)
+ open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
+ self.reindex = 1
+ self.changed = 1
+
+ def should_reindex(self):
+ '''Should we reindex?
+ '''
+ return self.reindex
- # NOTE: this method signature deviates from the one specified in
- # indexer - I'm not entirely sure where it was expected to the text
- # from otherwise...
- def add_othertext(self, identifier, text):
- ''' Add some text associated with the identifier
+ def add_text(self, identifier, text, mime_type='text/plain'):
+ '''Add some text associated with the (classname, nodeid, property)
+ identifier.
'''
- # Is file eligible for (re)indexing?
+ # make sure the index is loaded
+ self.load_index()
+
+ # remove old entries for this identifier
if self.files.has_key(identifier):
- # Reindexing enabled, cleanup dicts
- if self.reindex:
- self.purge_entry(identifier, self.files, self.words)
- else:
- # DO NOT reindex this file
- if self.quiet < 5:
- print "Not reindexing", identifier
- return 0
+ self.purge_entry(identifier)
# split into words
- words = self.splitter(text, 'text/plain')
+ words = self.splitter(text, mime_type)
# Find new file index, and assign it to identifier
# (_TOP uses trick of negative to avoid conflict with file index)
# make a reference to the file for this word
entry[file_index] = filedict[word]
+ # save needed
+ self.changed = 1
+
def splitter(self, text, ftype):
- ''' Split the contents of a text string into a list of 'words'
+ '''Split the contents of a text string into a list of 'words'
'''
- if ftype in ('text/plain', 'message/rfc822'):
- words = self.text_splitter(text, self.casesensitive)
+ if ftype == 'text/plain':
+ words = self.text_splitter(text)
else:
return []
return words
- def text_splitter(self, text, casesensitive=0):
+ def text_splitter(self, text):
"""Split text/plain string into a list of words
"""
- # Let's adjust case if not case-sensitive
- if not casesensitive:
- text = text.upper()
+ # case insensitive
+ text = text.upper()
# Split the raw text, losing anything longer than 25 characters
# since that'll be gibberish (encoded text or somesuch) or shorter
# place
return re.findall(r'\b\w{2,25}\b', text)
- def search(self, search_terms, klass):
- ''' display search results
+ def search(self, search_terms, klass, ignore={},
+ dre=re.compile(r'([^\d]+)(\d+)')):
+ '''Display search results looking for [search, terms] associated
+ with the hyperdb Class "klass". Ignore hits on {class: property}.
+
+ "dre" is a helper, not an argument.
'''
+ # do the index lookup
hits = self.find(search_terms)
- links = []
- nodeids = {}
- designator_propname = {'msg': 'messages', 'file': 'files'}
- if hits:
- hitcount = len(hits)
- # build a dictionary of nodes and their associated messages
- # and files
- for hit in hits.keys():
- filename = hits[hit].split('/')[-1]
- for designator, propname in designator_propname.items():
- if not filename.startswith(designator):
- continue
- nodeid = filename[len(designator):]
- result = apply(klass.find, (), {propname:nodeid})
- if not result:
- continue
-
- id = str(result[0])
- if not nodeids.has_key(id):
- nodeids[id] = {}
-
- node_dict = nodeids[id]
- if not node_dict.has_key(propname):
- node_dict[propname] = [nodeid]
- elif node_dict.has_key(propname):
- node_dict[propname].append(nodeid)
+ if not hits:
+ return {}
+
+ designator_propname = {}
+ for nm, propclass in klass.getprops().items():
+ if isinstance(propclass, Link) or isinstance(propclass, Multilink):
+ designator_propname[propclass.classname] = nm
+
+ # build a dictionary of nodes and their associated messages
+ # and files
+ nodeids = {} # this is the answer
+ propspec = {} # used to do the klass.find
+ for propname in designator_propname.values():
+ propspec[propname] = {} # used as a set (value doesn't matter)
+ for classname, nodeid, property in hits.values():
+ # skip this result if we don't care about this class/property
+ if ignore.has_key((classname, property)):
+ continue
+
+ # if it's a property on klass, it's easy
+ if classname == klass.classname:
+ if not nodeids.has_key(nodeid):
+ nodeids[nodeid] = {}
+ continue
+ # make sure the class is a linked one, otherwise ignore
+ if not designator_propname.has_key(classname):
+ continue
+
+ # it's a linked class - set up to do the klass.find
+ linkprop = designator_propname[classname] # eg, msg -> messages
+ propspec[linkprop][nodeid] = 1
+
+ # retain only the meaningful entries
+ for propname, idset in propspec.items():
+ if not idset:
+ del propspec[propname]
+
+ # klass.find tells me the klass nodeids the linked nodes relate to
+ for resid in klass.find(**propspec):
+ resid = str(resid)
+            if not nodeids.has_key(resid):
+ nodeids[resid] = {}
+ node_dict = nodeids[resid]
+ # now figure out where it came from
+ for linkprop in propspec.keys():
+ for nodeid in klass.get(resid, linkprop):
+ if propspec[linkprop].has_key(nodeid):
+ # OK, this node[propname] has a winner
+ if not node_dict.has_key(linkprop):
+ node_dict[linkprop] = [nodeid]
+ else:
+ node_dict[linkprop].append(nodeid)
return nodeids
# we override this to ignore not 2 < word < 25 and also to fix a bug -
# the (fail) case.
def find(self, wordlist):
- ''' Locate files that match ALL the words in wordlist
+ '''Locate files that match ALL the words in wordlist
'''
if not hasattr(self, 'words'):
self.load_index()
if not 2 < len(word) < 25:
# word outside the bounds of what we index - ignore
continue
- if not self.casesensitive:
- word = word.upper()
+ word = word.upper()
entry = self.words.get(word) # For each word, get index
entries[word] = entry # of matching files
if not entry: # Nothing for this one word (fail)
if hits is None:
hits = {}
for k in entry.keys():
+ if not self.fileids.has_key(k):
+ raise ValueError, 'Index is corrupted: re-generate it'
hits[k] = self.fileids[k]
else:
# Eliminate hits for every non-match
return {}
return hits
- segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!"
+ segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
def load_index(self, reload=0, wordlist=None):
# Unless reload is indicated, do not load twice
if self.index_loaded() and not reload:
try:
f = open(self.indexdb + segment, 'rb')
except IOError, error:
- if error.errno != errno.ENOENT:
- raise
+ # probably just nonexistent segment index file
+ if error.errno != errno.ENOENT: raise
else:
pickle_str = zlib.decompress(f.read())
f.close()
self.words = db['WORDS']
self.files = db['FILES']
self.fileids = db['FILEIDS']
+ self.changed = 0
def save_index(self):
+ # only save if the index is loaded and changed
+ if not self.index_loaded() or not self.changed:
+ return
+
# brutal space saver... delete all the small segments
for segment in self.segments:
try:
os.remove(self.indexdb + segment)
- except OSError:
+ except OSError, error:
# probably just nonexistent segment index file
- # TODO: make sure it's an EEXIST
- pass
+ if error.errno != errno.ENOENT: raise
# First write the much simpler filename/fileid dictionaries
dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))
# The hard part is splitting the word dictionary up, of course
- letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#"
+ letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
segdicts = {} # Need batch of empty dicts
for segment in letters:
segdicts[segment] = {}
pickle_fh.write(zlib.compress(pickle_str))
os.chmod(filename, 0664)
- def purge_entry(self, fname, file_dct, word_dct):
- ''' Remove a file from file index and word index
+ # save done
+ self.changed = 0
+
+ def purge_entry(self, identifier):
+ '''Remove a file from file index and word index
'''
- try: # The easy part, cleanup the file index
- file_index = file_dct[fname]
- del file_dct[fname]
- except KeyError:
- pass # We'll assume we only encounter KeyError's
+ self.load_index()
+
+ if not self.files.has_key(identifier):
+ return
+
+ file_index = self.files[identifier][0]
+ del self.files[identifier]
+ del self.fileids[file_index]
+
# The much harder part, cleanup the word index
- for word, occurs in word_dct.items():
+ for key, occurs in self.words.items():
if occurs.has_key(file_index):
del occurs[file_index]
- word_dct[word] = occurs
+
+ # save needed
+ self.changed = 1
def index_loaded(self):
return (hasattr(self,'fileids') and hasattr(self,'files') and
hasattr(self,'words'))
-#
-#$Log: not supported by cvs2svn $
-#Revision 1.2 2002/05/25 07:16:24 rochecompaan
-#Merged search_indexing-branch with HEAD
-#
-#Revision 1.1.2.3 2002/05/02 11:52:12 rochecompaan
-#Fixed small bug that prevented indexes from being generated.
-#
-#Revision 1.1.2.2 2002/04/19 19:54:42 rochecompaan
-#cgi_client.py
-# removed search link for the time being
-# moved rendering of matches to htmltemplate
-#hyperdb.py
-# filtering of nodes on full text search incorporated in filter method
-#roundupdb.py
-# added paramater to call of filter method
-#roundup_indexer.py
-# added search method to RoundupIndexer class
-#
-#Revision 1.1.2.1 2002/04/03 11:55:57 rochecompaan
-# . Added feature #526730 - search for messages capability
-#
+# vim: set filetype=python ts=4 sw=4 et si