diff --git a/roundup/indexer.py b/roundup/indexer.py
index 47f0120366129ba431368788dffcb4222e53dd40..74bfcafb9e3176ed17e632e75866354b82d18049 100644 (file)
--- a/roundup/indexer.py
+++ b/roundup/indexer.py
-#!/usr/bin/env python
-
-"""Create full-text indexes and search them
-
-Notes:
-
- See http://gnosis.cx/publish/programming/charming_python_15.txt
- for a detailed discussion of this module.
-
- This version requires Python 1.6+. It turns out that the use
- of string methods rather than [string] module functions is
- enough faster in a tight loop so as to provide a quite
- remarkable 25% speedup in overall indexing. However, only FOUR
- lines in TextSplitter.text_splitter() were changed away from
- Python 1.5 compatibility. Those lines are followed by comments
- beginning with "# 1.52: " that show the old forms. Python
- 1.5 users can restore these lines, and comment out those just
- above them.
-
-Classes:
-
- GenericIndexer -- Abstract class
- TextSplitter -- Mixin class
- Index
- ShelveIndexer
- FlatIndexer
- XMLPickleIndexer
- PickleIndexer
- ZPickleIndexer
- SlicedZPickleIndexer
-
-Functions:
-
- echo_fname(fname)
- recurse_files(...)
-
-Index Formats:
-
- *Indexer.files: filename --> (fileid, wordcount)
- *Indexer.fileids: fileid --> filename
- *Indexer.words: word --> {fileid1:occurs, fileid2:occurs, ...}
-
-Module Usage:
-
- There are a few ways to use this module. Just to utilize existing
- functionality, something like the following is a likely
- pattern:
-
- import gnosis.indexer as indexer
- index = indexer.MyFavoriteIndexer() # For some concrete Indexer
- index.load_index('myIndex.db')
- index.add_files(dir='/this/that/otherdir', pattern='*.txt')
- hits = index.find(['spam','eggs','bacon'])
- index.print_report(hits)
-
- To customize the basic classes, something like the following is likely:
-
- class MySplitter:
- def splitter(self, text, ftype):
- "Peform much better splitting than default (for filetypes)"
- # ...
- return words
-
- class MyIndexer(indexer.GenericIndexer, MySplitter):
- def load_index(self, INDEXDB=None):
- "Retrieve three dictionaries from clever storage method"
- # ...
- self.words, self.files, self.fileids = WORDS, FILES, FILEIDS
- def save_index(self, INDEXDB=None):
- "Save three dictionaries to clever storage method"
-
- index = MyIndexer()
- # ...etc...
-
-Benchmarks:
-
- As we know, there are lies, damn lies, and benchmarks. Take
- the below with an adequate dose of salt. In version 0.10 of
- the concrete indexers, some performance was tested. The
- test case was a set of mail/news archives, that were about
- 43 mB, and 225 files. In each case, an index was generated
- (if possible), and a search for the words "xml python" was
- performed.
-
- - Index w/ PickleIndexer: 482s, 2.4 mB
- - Search w/ PickleIndexer: 1.74s
- - Index w/ ZPickleIndexer: 484s, 1.2 mB
- - Search w/ ZPickleIndexer: 1.77s
- - Index w/ FlatIndexer: 492s, 2.6 mB
- - Search w/ FlatIndexer: 53s
- - Index w/ ShelveIndexer: (dumbdbm) Many minutes, tens of mBs
- - Search w/ ShelveIndexer: Aborted before completely indexed
- - Index w/ ShelveIndexer: (dbhash) Long time (partial crash), 10 mB
- - Search w/ ShelveIndexer: N/A. Too many glitches
- - Index w/ XMLPickleIndexer: Memory error (xml_pickle uses bad string
- composition for large output)
- - Search w/ XMLPickleIndexer: N/A
- - grep search (xml|python): 20s (cached: <5s)
- - 'srch' utility (python): 12s
-"""
-#$Id: indexer.py,v 1.2 2002-05-25 07:16:24 rochecompaan Exp $
-
-__shell_usage__ = """
-Shell Usage: [python] indexer.py [options] [search_words]
-
- -h, /h, -?, /?, ?, --help: Show this help screen
- -index: Add files to index
- -reindex: Refresh files already in the index
- (can take much more time)
- -casesensitive: Maintain the case of indexed words
- (can lead to MUCH larger indices)
- -norecurse, -local: Only index starting dir, not subdirs
- -dir=<directory>: Starting directory for indexing
- (default is current directory)
- -indexdb=<database>: Use specified index database
- (environ variable INDEXER_DB is preferred)
- -regex=<pattern>: Index files matching regular expression
- -glob=<pattern>: Index files matching glob pattern
- -filter=<pattern> Only display results matching pattern
- -output=<op>, -format=<opt>: How much detail on matches?
- -<digit>: Quiet level (0=verbose ... 9=quiet)
-
-Output/format options are ALL/EVERYTHING/VERBOSE, RATINGS/SCORES,
-FILENAMES/NAMES/FILES, SUMMARY/REPORT"""
-
-__version__ = "$Revision: 1.2 $"
-__author__=["David Mertz (mertz@gnosis.cx)",]
-__thanks_to__=["Pat Knight (p.knight@ktgroup.co.uk)",
- "Gregory Popovitch (greg@gpy.com)", ]
-__copyright__="""
- This file is released to the public domain. I (dqm) would
- appreciate it if you choose to keep derived works under terms
- that promote freedom, but obviously am giving up any rights
- to compel such.
-"""
-
-__history__="""
- 0.1 Initial version.
-
- 0.11 Tweaked TextSplitter after some random experimentation.
-
- 0.12 Added SlicedZPickleIndexer (best choice, so far).
-
- 0.13 Pat Knight pointed out need for binary open()'s of
- certain files under Windows.
-
- 0.14 Added '-filter' switch to search results.
-
- 0.15 Added direct read of gzip files
-
- 0.20 Gregory Popovitch did some profiling on TextSplitter,
- and provided both huge speedups to the Python version
- and hooks to a C extension class (ZopeTextSplitter).
- A little refactoring by he and I (dqm) has nearly
- doubled the speed of indexing
-
- 0.30 Module refactored into gnosis package. This is a
- first pass, and various documentation and test cases
- should be added later.
-"""
-import string, re, os, fnmatch, sys, copy, gzip
-from types import *
-
-#-- Silly "do nothing" default recursive file processor
-def echo_fname(fname): print fname
-
-#-- "Recurse and process files" utility function
-def recurse_files(curdir, pattern, exclusions, func=echo_fname, *args, **kw):
- "Recursively process file pattern"
- subdirs, files = [],[]
- level = kw.get('level',0)
-
- for name in os.listdir(curdir):
- fname = os.path.join(curdir, name)
- if name[-4:] in exclusions:
- pass # do not include binary file type
- elif os.path.isdir(fname) and not os.path.islink(fname):
- subdirs.append(fname)
- # kludge to detect a regular expression across python versions
- elif sys.version[0]=='1' and isinstance(pattern, re.RegexObject):
- if pattern.match(name):
- files.append(fname)
- elif sys.version[0]=='2' and type(pattern)==type(re.compile('')):
- if pattern.match(name):
- files.append(fname)
- elif type(pattern) is StringType:
- if fnmatch.fnmatch(name, pattern):
- files.append(fname)
-
- for fname in files:
- apply(func, (fname,)+args)
- for subdir in subdirs:
- recurse_files(subdir, pattern, exclusions, func, level=level+1)
-
-#-- Data bundle for index dictionaries
-class Index:
- def __init__(self, words, files, fileids):
- if words is not None: self.WORDS = words
- if files is not None: self.FILES = files
- if fileids is not None: self.FILEIDS = fileids
-
-#-- "Split plain text into words" utility function
-class TextSplitter:
- def initSplitter(self):
- prenum = string.join(map(chr, range(0,48)), '')
- num2cap = string.join(map(chr, range(58,65)), '')
- cap2low = string.join(map(chr, range(91,97)), '')
- postlow = string.join(map(chr, range(123,256)), '')
- nonword = prenum + num2cap + cap2low + postlow
- self.word_only = string.maketrans(nonword, " "*len(nonword))
- self.nondigits = string.join(map(chr, range(0,48)) + map(chr, range(58,255)), '')
- self.alpha = string.join(map(chr, range(65,91)) + map(chr, range(97,123)), '')
- self.ident = string.join(map(chr, range(256)), '')
- self.init = 1
-
- def splitter(self, text, ftype):
- "Split the contents of a text string into a list of 'words'"
- if ftype == 'text/plain':
- words = self.text_splitter(text, self.casesensitive)
- else:
- raise NotImplementedError
- return words
-
- def text_splitter(self, text, casesensitive=0):
- """Split text/plain string into a list of words
-
- In version 0.20 this function is still fairly weak at
- identifying "real" words, and excluding gibberish
- strings. As long as the indexer looks at "real" text
- files, it does pretty well; but if indexing of binary
- data is attempted, a lot of gibberish gets indexed.
- Suggestions on improving this are GREATLY APPRECIATED.
- """
- # Initialize some constants
- if not hasattr(self,'init'): self.initSplitter()
-
- # Speedup trick: attributes into local scope
- word_only = self.word_only
- ident = self.ident
- alpha = self.alpha
- nondigits = self.nondigits
- translate = string.translate
-
- # Let's adjust case if not case-sensitive
- if not casesensitive: text = string.upper(text)
-
- # Split the raw text
- allwords = string.split(text)
-
- # Finally, let's skip some words not worth indexing
- words = []
- for word in allwords:
- if len(word) > 25: continue # too long (probably gibberish)
-
- # Identify common patterns in non-word data (binary, UU/MIME, etc)
- num_nonalpha = len(word.translate(ident, alpha))
- numdigits = len(word.translate(ident, nondigits))
- # 1.52: num_nonalpha = len(translate(word, ident, alpha))
- # 1.52: numdigits = len(translate(word, ident, nondigits))
- if numdigits > len(word)-2: # almost all digits
- if numdigits > 5: # too many digits is gibberish
- continue # a moderate number is year/zipcode/etc
- elif num_nonalpha*3 > len(word): # too much scattered nonalpha = gibberish
- continue
-
- word = word.translate(word_only) # Let's strip funny byte values
- # 1.52: word = translate(word, word_only)
- subwords = word.split() # maybe embedded non-alphanumeric
- # 1.52: subwords = string.split(word)
- for subword in subwords: # ...so we might have subwords
- if len(subword) <= 2: continue # too short a subword
- words.append(subword)
- return words
-
-class ZopeTextSplitter:
- def initSplitter(self):
- import Splitter
- stop_words=(
- 'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across',
- 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone',
- 'along', 'already', 'also', 'although', 'always', 'am', 'among',
- 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any',
- 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around',
- 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes',
- 'becoming', 'been', 'before', 'beforehand', 'behind', 'being',
- 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both',
- 'bottom', 'but', 'by', 'can', 'cannot', 'cant', 'con', 'could',
- 'couldnt', 'cry', 'describe', 'detail', 'do', 'done', 'down', 'due',
- 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else',
- 'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone',
- 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty',
- 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly',
- 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get',
- 'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her',
- 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers',
- 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i',
- 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it',
- 'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least',
- 'less', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill',
- 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must',
- 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless',
- 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not',
- 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once',
- 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our',
- 'ours', 'ourselves', 'out', 'over', 'own', 'per', 'perhaps',
- 'please', 'pre', 'put', 'rather', 're', 'same', 'see', 'seem',
- 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should',
- 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some',
- 'somehow', 'someone', 'something', 'sometime', 'sometimes',
- 'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the',
- 'their', 'them', 'themselves', 'then', 'thence', 'there',
- 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these',
- 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three',
- 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too',
- 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under',
- 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well',
- 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
- 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
- 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
- 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without',
- 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves',
- )
- self.stop_word_dict={}
- for word in stop_words: self.stop_word_dict[word]=None
- self.splitterobj = Splitter.getSplitter()
- self.init = 1
-
- def goodword(self, word):
- return len(word) < 25
-
- def splitter(self, text, ftype):
- """never case-sensitive"""
- if not hasattr(self,'init'): self.initSplitter()
- return filter(self.goodword, self.splitterobj(text, self.stop_word_dict))
-
-
-#-- "Abstract" parent class for inherited indexers
-# (does not handle storage in parent, other methods are primitive)
-
-class GenericIndexer:
- def __init__(self, **kw):
- apply(self.configure, (), kw)
+#
+# This module is derived from the module described at:
+# http://gnosis.cx/publish/programming/charming_python_15.txt
+#
+# Author: David Mertz (mertz@gnosis.cx)
+# Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
+# Gregory Popovitch (greg@gpy.com)
+#
+# The original module was released under this license, and remains under
+# it:
+#
+# This file is released to the public domain. I (dqm) would
+# appreciate it if you choose to keep derived works under terms
+# that promote freedom, but obviously am giving up any rights
+# to compel such.
+#
+#$Id: indexer.py,v 1.12 2002-07-19 03:36:33 richard Exp $
+'''
+This module provides an indexer class, RoundupIndexer, that stores text
+indices in a roundup instance. This class makes searching the content of
+messages, string properties and text files possible.
+'''
+import os, shutil, re, mimetypes, marshal, zlib, errno
+from hyperdb import Link, Multilink
+
+class Indexer:
+ ''' Indexes information from roundup's hyperdb to allow efficient
+ searching.
+
+ Three structures are created by the indexer:
+ files {identifier: (fileid, wordcount)}
+ words {word: {fileid: count}}
+ fileids {fileid: identifier}
+ where identifier is (classname, nodeid, propertyname)
+ '''
+    def __init__(self, db_path):
+        '''Initialise the indexer over the 'indexes' directory under db_path.
+
+        Sets up the index database paths and state flags, and forces a
+        full reindex when the index directory or its version marker is
+        missing, or when the marker does not hold the current value ('1').
+        '''
+        self.indexdb_path = os.path.join(db_path, 'indexes')
+        self.indexdb = os.path.join(self.indexdb_path, 'index.db')
+        # consulted by should_reindex(); set by force_reindex()
+        self.reindex = 0
+        # verbosity level retained from the original gnosis indexer (9 = quiet)
+        self.quiet = 9
+        # set when the in-memory index differs from what is on disk
+        self.changed = 0
+
+        # see if we need to reindex because of a change in code
+        version = os.path.join(self.indexdb_path, 'version')
+        if (not os.path.exists(self.indexdb_path) or
+            not os.path.exists(version)):
+            # for now the file itself is a flag
+            self.force_reindex()
+        elif os.path.exists(version):
+            # rebind: 'version' now holds the marker file's contents
+            version = open(version).read()
+            # check the value and reindex if it's not the latest
+            if version.strip() != '1':
+                self.force_reindex()
+
+    def force_reindex(self):
+        '''Force a reindex condition
+
+        Removes any existing index directory, recreates it with a fresh
+        version marker file, and flags the index as both needing a
+        rebuild and having unsaved changes.
+        '''
+        if os.path.exists(self.indexdb_path):
+            shutil.rmtree(self.indexdb_path)
+        os.makedirs(self.indexdb_path)
+        # NOTE(review): group-writable, presumably so other roundup
+        # processes (web/mail interfaces) may rewrite the index -- confirm
+        os.chmod(self.indexdb_path, 0775)
+        # the version file doubles as the "index exists" marker
+        open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
+        self.reindex = 1
+        self.changed = 1
+
+    def should_reindex(self):
+        '''Should we reindex?
+
+        True when force_reindex() has flagged that the index must be
+        rebuilt from scratch.
+        '''
+        return self.reindex
+
+ def add_text(self, identifier, text, mime_type='text/plain'):
+ ''' Add some text associated with the (classname, nodeid, property)
+ identifier.
+ '''
+ # make sure the index is loaded
+ self.load_index()
- def whoami(self):
- return self.__class__.__name__
+ # remove old entries for this identifier
+ if self.files.has_key(identifier):
+ self.purge_entry(identifier)
- def configure(self, REINDEX=0, CASESENSITIVE=0,
- INDEXDB=os.environ.get('INDEXER_DB', 'TEMP_NDX.DB'),
- ADD_PATTERN='*', QUIET=5):
- "Configure settings used by indexing and storage/retrieval"
- self.indexdb = INDEXDB
- self.reindex = REINDEX
- self.casesensitive = CASESENSITIVE
- self.add_pattern = ADD_PATTERN
- self.quiet = QUIET
- self.filter = None
+ # split into words
+ words = self.splitter(text, mime_type)
- def add_files(self, dir=os.getcwd(), pattern=None, descend=1):
- self.load_index()
- exclusions = ('.zip','.pyc','.gif','.jpg','.dat','.dir')
- if not pattern:
- pattern = self.add_pattern
- recurse_files(dir, pattern, exclusions, self.add_file)
- # Rebuild the fileid index
- self.fileids = {}
- for fname in self.files.keys():
- fileid = self.files[fname][0]
- self.fileids[fileid] = fname
-
- def add_file(self, fname, ftype='text/plain'):
- "Index the contents of a regular file"
- if self.files.has_key(fname): # Is file eligible for (re)indexing?
- if self.reindex: # Reindexing enabled, cleanup dicts
- self.purge_entry(fname, self.files, self.words)
- else: # DO NOT reindex this file
- if self.quiet < 5: print "Skipping", fname
- return 0
-
- # Read in the file (if possible)
- try:
- if fname[-3:] == '.gz':
- text = gzip.open(fname).read()
- else:
- text = open(fname).read()
- if self.quiet < 5: print "Indexing", fname
- except IOError:
- return 0
- words = self.splitter(text, ftype)
-
- # Find new file index, and assign it to filename
+ # Find new file index, and assign it to identifier
# (_TOP uses trick of negative to avoid conflict with file index)
self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
- file_index = abs(self.files['_TOP'][0])
- self.files[fname] = (file_index, len(words))
+ file_index = abs(self.files['_TOP'][0])
+ self.files[identifier] = (file_index, len(words))
+ self.fileids[file_index] = identifier
+ # find the unique words
filedict = {}
for word in words:
             if filedict.has_key(word):
                 filedict[word] = filedict[word]+1
             else:
                 filedict[word] = 1
+ # now add to the totals
for word in filedict.keys():
+ # each word has a dict of {identifier: count}
if self.words.has_key(word):
entry = self.words[word]
else:
+ # new word
entry = {}
+ self.words[word] = entry
+
+ # make a reference to the file for this word
entry[file_index] = filedict[word]
- self.words[word] = entry
- def add_othertext(self, identifier):
- """Index a textual source other than a plain file
+ # save needed
+ self.changed = 1
- A child class might want to implement this method (or a similar one)
- in order to index textual sources such as SQL tables, URLs, clay
- tablets, or whatever else. The identifier should uniquely pick out
- the source of the text (whatever it is)
- """
- raise NotImplementedError
+    def splitter(self, text, ftype):
+        ''' Split the contents of a text string into a list of 'words'
+
+        Only text/plain content is handled; any other MIME type yields
+        an empty word list (ie. the content is not indexed).
+        '''
+        if ftype == 'text/plain':
+            words = self.text_splitter(text)
+        else:
+            return []
+        return words
- def save_index(self, INDEXDB=None):
- raise NotImplementedError
+    def text_splitter(self, text):
+        """Split text/plain string into a list of uppercased words
+        """
+        # case insensitive
+        text = text.upper()
+
+        # Split the raw text, losing anything longer than 25 characters
+        # since that'll be gibberish (encoded text or somesuch) or shorter
+        # than 2 characters since those short words appear all over the
+        # place
+        # NOTE(review): find() only looks up words with 2 < len < 25, so
+        # the 2- and 25-character words indexed here can never be matched
+        # by a search -- the two bounds should probably agree
+        return re.findall(r'\b\w{2,25}\b', text)
+
+    def search(self, search_terms, klass, ignore={},
+            dre=re.compile(r'([^\d]+)(\d+)')):
+        ''' Display search results looking for [search, terms] associated
+            with the hyperdb Class "klass". Ignore hits on {class: property}.
+
+            "dre" is a helper, not an argument.
+
+            Returns {nodeid: {linkprop: [nodeids]}} mapping matching klass
+            nodes to the linked nodes (eg. messages/files) that produced
+            the hit; direct property hits map to an empty dict.
+        '''
+        # do the index lookup
+        hits = self.find(search_terms)
+        if not hits:
+            return {}
+
+        #designator_propname = {'msg': 'messages', 'file': 'files'}
+        designator_propname = {}
+        for nm, propclass in klass.getprops().items():
+            if isinstance(propclass, Link) or isinstance(propclass, Multilink):
+                designator_propname[propclass.classname] = nm
+
+        # build a dictionary of nodes and their associated messages
+        # and files
+        nodeids = {} # this is the answer
+        propspec = {} # used to do the klass.find
+        for propname in designator_propname.values():
+            propspec[propname] = {} # used as a set (value doesn't matter)
+        for classname, nodeid, property in hits.values():
+            # skip this result if we don't care about this class/property
+            if ignore.has_key((classname, property)):
+                continue
-    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
-        raise NotImplementedError
+            # if it's a property on klass, it's easy
+            if classname == klass.classname:
+                if not nodeids.has_key(nodeid):
+                    nodeids[nodeid] = {}
+                continue
-    def find(self, wordlist, print_report=0):
-        "Locate files that match ALL the words in wordlist"
+            # it's a linked class - set up to do the klass.find
+            linkprop = designator_propname[classname] # eg, msg -> messages
+            propspec[linkprop][nodeid] = 1
+
+        # retain only the meaningful entries
+        for propname, idset in propspec.items():
+            if not idset:
+                del propspec[propname]
+
+        # klass.find tells me the klass nodeids the linked nodes relate to
+        # NOTE(review): if propspec ends up empty, klass.find() is called
+        # with no arguments -- confirm hyperdb handles that gracefully
+        for resid in klass.find(**propspec):
+            resid = str(resid)
+            # BUGFIX: this tested has_key(id) -- the *builtin* function,
+            # never a dict key -- so the guard never held and the entry
+            # was unconditionally reset; test the actual node id
+            if not nodeids.has_key(resid):
+                nodeids[resid] = {}
+            node_dict = nodeids[resid]
+            # now figure out where it came from
+            for linkprop in propspec.keys():
+                for nodeid in klass.get(resid, linkprop):
+                    if propspec[linkprop].has_key(nodeid):
+                        # OK, this node[propname] has a winner
+                        if not node_dict.has_key(linkprop):
+                            node_dict[linkprop] = [nodeid]
+                        else:
+                            node_dict[linkprop].append(nodeid)
+        return nodeids
+
+ # we override this to ignore not 2 < word < 25 and also to fix a bug -
+ # the (fail) case.
+ def find(self, wordlist):
+ ''' Locate files that match ALL the words in wordlist
+ '''
+ if not hasattr(self, 'words'):
+ self.load_index()
self.load_index(wordlist=wordlist)
entries = {}
- hits = copy.copy(self.fileids) # Copy of fileids index
+ hits = None
for word in wordlist:
- if not self.casesensitive:
- word = string.upper(word)
+ if not 2 < len(word) < 25:
+ # word outside the bounds of what we index - ignore
+ continue
+ word = word.upper()
entry = self.words.get(word) # For each word, get index
entries[word] = entry # of matching files
if not entry: # Nothing for this one word (fail)
- return 0
- for fileid in hits.keys(): # Eliminate hits for every non-match
- if not entry.has_key(fileid):
- del hits[fileid]
- if print_report:
- self.print_report(hits, wordlist, entries)
- return hits
-
- def print_report(self, hits={}, wordlist=[], entries={}):
- # Figure out what to actually print (based on QUIET level)
- output = []
- for fileid,fname in hits.items():
- message = fname
- if self.quiet <= 3:
- wordcount = self.files[fname][1]
- matches = 0
- countmess = '\n'+' '*13+`wordcount`+' words; '
- for word in wordlist:
- if not self.casesensitive:
- word = string.upper(word)
- occurs = entries[word][fileid]
- matches = matches+occurs
- countmess = countmess +`occurs`+' '+word+'; '
- message = string.ljust('[RATING: '
- +`1000*matches/wordcount`+']',13)+message
- if self.quiet <= 2: message = message +countmess +'\n'
- if self.filter: # Using an output filter
- if fnmatch.fnmatch(message, self.filter):
- output.append(message)
+ return {}
+ if hits is None:
+ hits = {}
+ for k in entry.keys():
+ hits[k] = self.fileids[k]
else:
- output.append(message)
-
- if self.quiet <= 5:
- print string.join(output,'\n')
- sys.stderr.write('\n'+`len(output)`+' files matched wordlist: '+
- `wordlist`+'\n')
- return output
-
- def purge_entry(self, fname, file_dct, word_dct):
- "Remove a file from file index and word index"
- try: # The easy part, cleanup the file index
- file_index = file_dct[fname]
- del file_dct[fname]
- except KeyError:
- pass # We'll assume we only encounter KeyError's
- # The much harder part, cleanup the word index
- for word, occurs in word_dct.items():
- if occurs.has_key(file_index):
- del occurs[file_index]
- word_dct[word] = occurs
+ # Eliminate hits for every non-match
+ for fileid in hits.keys():
+ if not entry.has_key(fileid):
+ del hits[fileid]
+ if hits is None:
+ return {}
+ return hits
- def index_loaded(self):
- return ( hasattr(self,'fileids') and
- hasattr(self,'files') and
- hasattr(self,'words') )
-
-#-- Provide an actual storage facility for the indexes (i.e. shelve)
-class ShelveIndexer(GenericIndexer, TextSplitter):
- """Concrete Indexer utilizing [shelve] for storage
-
- Unfortunately, [shelve] proves far too slow in indexing, while
- creating monstrously large indexes. Not recommend, at least under
- the default dbm's tested. Also, class may be broken because
- shelves do not, apparently, support the .values() and .items()
- methods. Fixing this is a low priority, but the sample code is
- left here.
- """
- def load_index(self, INDEXDB=None, reload=0, wordlist=None):
- INDEXDB = INDEXDB or self.indexdb
- import shelve
- self.words = shelve.open(INDEXDB+".WORDS")
- self.files = shelve.open(INDEXDB+".FILES")
- self.fileids = shelve.open(INDEXDB+".FILEIDS")
- if not FILES: # New index
- self.files['_TOP'] = (0,None)
-
- def save_index(self, INDEXDB=None):
- INDEXDB = INDEXDB or self.indexdb
- pass
-
-class FlatIndexer(GenericIndexer, TextSplitter):
- """Concrete Indexer utilizing flat-file for storage
-
- See the comments in the referenced article for details; in
- brief, this indexer has about the same timing as the best in
- -creating- indexes and the storage requirements are
- reasonable. However, actually -using- a flat-file index is
- more than an order of magnitude worse than the best indexer
- (ZPickleIndexer wins overall).
-
- On the other hand, FlatIndexer creates a wonderfully easy to
- parse database format if you have a reason to transport the
- index to a different platform or programming language. And
- should you perform indexing as part of a long-running
- process, the overhead of initial file parsing becomes
- irrelevant.
- """
- def load_index(self, INDEXDB=None, reload=0, wordlist=None):
+ segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
+ def load_index(self, reload=0, wordlist=None):
# Unless reload is indicated, do not load twice
- if self.index_loaded() and not reload: return 0
- # Ok, now let's actually load it
- INDEXDB = INDEXDB or self.indexdb
- self.words = {}
- self.files = {'_TOP':(0,None)}
- self.fileids = {}
- try: # Read index contents
- for line in open(INDEXDB).readlines():
- fields = string.split(line)
- if fields[0] == '-': # Read a file/fileid line
- fileid = eval(fields[2])
- wordcount = eval(fields[3])
- fname = fields[1]
- self.files[fname] = (fileid, wordcount)
- self.fileids[fileid] = fname
- else: # Read a word entry (dict of hits)
- entries = {}
- word = fields[0]
- for n in range(1,len(fields),2):
- fileid = eval(fields[n])
- occurs = eval(fields[n+1])
- entries[fileid] = occurs
- self.words[word] = entries
- except:
- pass # New index
-
- def save_index(self, INDEXDB=None):
- INDEXDB = INDEXDB or self.indexdb
- tab, lf, sp = '\t','\n',' '
- indexdb = open(INDEXDB,'w')
- for fname,entry in self.files.items():
- indexdb.write('- '+fname +tab +`entry[0]` +tab +`entry[1]` +lf)
- for word,entry in self.words.items():
- indexdb.write(word +tab+tab)
- for fileid,occurs in entry.items():
- indexdb.write(`fileid` +sp +`occurs` +sp)
- indexdb.write(lf)
-
-class PickleIndexer(GenericIndexer, TextSplitter):
- def load_index(self, INDEXDB=None, reload=0, wordlist=None):
- # Unless reload is indicated, do not load twice
- if self.index_loaded() and not reload: return 0
- # Ok, now let's actually load it
- import cPickle
- INDEXDB = INDEXDB or self.indexdb
- try:
- pickle_str = open(INDEXDB,'rb').read()
- db = cPickle.loads(pickle_str)
- except: # New index
- db = Index({}, {'_TOP':(0,None)}, {})
- self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS
-
- def save_index(self, INDEXDB=None):
- import cPickle
- INDEXDB = INDEXDB or self.indexdb
- db = Index(self.words, self.files, self.fileids)
- open(INDEXDB,'wb').write(cPickle.dumps(db, 1))
-
-class XMLPickleIndexer(PickleIndexer):
- """Concrete Indexer utilizing XML for storage
-
- While this is, as expected, a verbose format, the possibility
- of using XML as a transport format for indexes might be
- useful. However, [xml_pickle] is in need of some redesign to
- avoid gross inefficiency when creating very large
- (multi-megabyte) output files (fixed in [xml_pickle] version
- 0.48 or above)
- """
- def load_index(self, INDEXDB=None, reload=0, wordlist=None):
- # Unless reload is indicated, do not load twice
- if self.index_loaded() and not reload: return 0
- # Ok, now let's actually load it
- from gnosis.xml.pickle import XML_Pickler
- INDEXDB = INDEXDB or self.indexdb
- try: # XML file exists
- xml_str = open(INDEXDB).read()
- db = XML_Pickler().loads(xml_str)
- except: # New index
- db = Index({}, {'_TOP':(0,None)}, {})
- self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS
-
- def save_index(self, INDEXDB=None):
- from gnosis.xml.pickle import XML_Pickler
- INDEXDB = INDEXDB or self.indexdb
- db = Index(self.words, self.files, self.fileids)
- open(INDEXDB,'w').write(XML_Pickler(db).dumps())
-
-class ZPickleIndexer(PickleIndexer):
- def load_index(self, INDEXDB=None, reload=0, wordlist=None):
- # Unless reload is indicated, do not load twice
- if self.index_loaded() and not reload: return 0
- # Ok, now let's actually load it
- import cPickle, zlib
- INDEXDB = INDEXDB or self.indexdb
- try:
- pickle_str = zlib.decompress(open(INDEXDB+'!','rb').read())
- db = cPickle.loads(pickle_str)
- except: # New index
- db = Index({}, {'_TOP':(0,None)}, {})
- self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS
-
- def save_index(self, INDEXDB=None):
- import cPickle, zlib
- INDEXDB = INDEXDB or self.indexdb
- db = Index(self.words, self.files, self.fileids)
- pickle_fh = open(INDEXDB+'!','wb')
- pickle_fh.write(zlib.compress(cPickle.dumps(db, 1)))
-
-
-class SlicedZPickleIndexer(ZPickleIndexer):
- segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!"
- def load_index(self, INDEXDB=None, reload=0, wordlist=None):
- # Unless reload is indicated, do not load twice
- if self.index_loaded() and not reload: return 0
+ if self.index_loaded() and not reload:
+ return 0
+
# Ok, now let's actually load it
- import cPickle, zlib
- INDEXDB = INDEXDB or self.indexdb
- db = Index({}, {'_TOP':(0,None)}, {})
+ db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}}
+
# Identify the relevant word-dictionary segments
if not wordlist:
segments = self.segments
else:
segments = ['-','#']
for word in wordlist:
- segments.append(string.upper(word[0]))
+ segments.append(word[0].upper())
+
# Load the segments
for segment in segments:
try:
- pickle_str = zlib.decompress(open(INDEXDB+segment,'rb').read())
- dbslice = cPickle.loads(pickle_str)
- if dbslice.__dict__.get('WORDS'): # If it has some words, add them
- for word,entry in dbslice.WORDS.items():
- db.WORDS[word] = entry
- if dbslice.__dict__.get('FILES'): # If it has some files, add them
- db.FILES = dbslice.FILES
- if dbslice.__dict__.get('FILEIDS'): # If it has fileids, add them
- db.FILEIDS = dbslice.FILEIDS
- except:
- pass # No biggie, couldn't find this segment
- self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS
-
- def julienne(self, INDEXDB=None):
- import cPickle, zlib
- INDEXDB = INDEXDB or self.indexdb
- segments = self.segments # all the (little) indexes
- for segment in segments:
- try: # brutal space saver... delete all the small segments
- os.remove(INDEXDB+segment)
- except OSError:
- pass # probably just nonexistent segment index file
+ f = open(self.indexdb + segment, 'rb')
+ except IOError, error:
+ # probably just nonexistent segment index file
+ if error.errno != errno.ENOENT: raise
+ else:
+ pickle_str = zlib.decompress(f.read())
+ f.close()
+ dbslice = marshal.loads(pickle_str)
+ if dbslice.get('WORDS'):
+ # if it has some words, add them
+ for word, entry in dbslice['WORDS'].items():
+ db['WORDS'][word] = entry
+ if dbslice.get('FILES'):
+ # if it has some files, add them
+ db['FILES'] = dbslice['FILES']
+ if dbslice.get('FILEIDS'):
+ # if it has fileids, add them
+ db['FILEIDS'] = dbslice['FILEIDS']
+
+ self.words = db['WORDS']
+ self.files = db['FILES']
+ self.fileids = db['FILEIDS']
+ self.changed = 0
+
+ def save_index(self):
+ # only save if the index is loaded and changed
+ if not self.index_loaded() or not self.changed:
+ return
+
+ # brutal space saver... delete all the small segments
+ for segment in self.segments:
+ try:
+ os.remove(self.indexdb + segment)
+ except OSError, error:
+ # probably just nonexistent segment index file
+ if error.errno != errno.ENOENT: raise
+
# First write the much simpler filename/fileid dictionaries
- dbfil = Index(None, self.files, self.fileids)
- open(INDEXDB+'-','wb').write(zlib.compress(cPickle.dumps(dbfil,1)))
+ dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
+ open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))
+
# The hard part is splitting the word dictionary up, of course
- letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
segdicts = {} # Need batch of empty dicts
- for segment in letters+'#':
+ for segment in letters:
segdicts[segment] = {}
for word, entry in self.words.items(): # Split into segment dicts
- initchar = string.upper(word[0])
- if initchar in letters:
- segdicts[initchar][word] = entry
- else:
- segdicts['#'][word] = entry
- for initchar in letters+'#':
- db = Index(segdicts[initchar], None, None)
- pickle_str = cPickle.dumps(db, 1)
- filename = INDEXDB+initchar
- pickle_fh = open(filename,'wb')
+ initchar = word[0].upper()
+ segdicts[initchar][word] = entry
+
+ # save
+ for initchar in letters:
+ db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None}
+ pickle_str = marshal.dumps(db)
+ filename = self.indexdb + initchar
+ pickle_fh = open(filename, 'wb')
pickle_fh.write(zlib.compress(pickle_str))
- os.chmod(filename,0664)
-
- save_index = julienne
-
-PreferredIndexer = SlicedZPickleIndexer
-
-#-- If called from command-line, parse arguments and take actions
-if __name__ == '__main__':
- import time
- start = time.time()
- search_words = [] # Word search list (if specified)
- opts = 0 # Any options specified?
- if len(sys.argv) < 2:
- pass # No options given
- else:
- upper = string.upper
- dir = os.getcwd() # Default to indexing from current directory
- descend = 1 # Default to recursive indexing
- ndx = PreferredIndexer()
- for opt in sys.argv[1:]:
- if opt in ('-h','/h','-?','/?','?','--help'): # help screen
- print __shell_usage__
- opts = -1
- break
- elif opt[0] in '/-': # a switch!
- opts = opts+1
- if upper(opt[1:]) == 'INDEX': # Index files
- ndx.quiet = 0
- pass # Use defaults if no other options
- elif upper(opt[1:]) == 'REINDEX': # Reindex
- ndx.reindex = 1
- elif upper(opt[1:]) == 'CASESENSITIVE': # Case sensitive
- ndx.casesensitive = 1
- elif upper(opt[1:]) in ('NORECURSE','LOCAL'): # No recursion
- descend = 0
- elif upper(opt[1:4]) == 'DIR': # Dir to index
- dir = opt[5:]
- elif upper(opt[1:8]) == 'INDEXDB': # Index specified
- ndx.indexdb = opt[9:]
- sys.stderr.write(
- "Use of INDEXER_DB environment variable is STRONGLY recommended.\n")
- elif upper(opt[1:6]) == 'REGEX': # RegEx files to index
- ndx.add_pattern = re.compile(opt[7:])
- elif upper(opt[1:5]) == 'GLOB': # Glob files to index
- ndx.add_pattern = opt[6:]
- elif upper(opt[1:7]) in ('OUTPUT','FORMAT'): # How should results look?
- opts = opts-1 # this is not an option for indexing purposes
- level = upper(opt[8:])
- if level in ('ALL','EVERYTHING','VERBOSE', 'MAX'):
- ndx.quiet = 0
- elif level in ('RATINGS','SCORES','HIGH'):
- ndx.quiet = 3
- elif level in ('FILENAMES','NAMES','FILES','MID'):
- ndx.quiet = 5
- elif level in ('SUMMARY','MIN'):
- ndx.quiet = 9
- elif upper(opt[1:7]) == 'FILTER': # Regex filter output
- opts = opts-1 # this is not an option for indexing purposes
- ndx.filter = opt[8:]
- elif opt[1:] in string.digits:
- opts = opts-1
- ndx.quiet = eval(opt[1])
- else:
- search_words.append(opt) # Search words
-
- if opts > 0:
- ndx.add_files(dir=dir)
- ndx.save_index()
- if search_words:
- ndx.find(search_words, print_report=1)
- if not opts and not search_words:
- sys.stderr.write("Perhaps you would like to use the --help option?\n")
- else:
- sys.stderr.write('Processed in %.3f seconds (%s)'
- % (time.time()-start, ndx.whoami()))
+ os.chmod(filename, 0664)
+
+ # save done
+ self.changed = 0
+
+ def purge_entry(self, identifier):
+ ''' Remove a file from file index and word index
+ '''
+ self.load_index()
+
+ if not self.files.has_key(identifier):
+ return
+
+ file_index = self.files[identifier][0]
+ del self.files[identifier]
+ del self.fileids[file_index]
+
+ # The much harder part, cleanup the word index
+ for key, occurs in self.words.items():
+ if occurs.has_key(file_index):
+ del occurs[file_index]
+
+ # save needed
+ self.changed = 1
+
+ def index_loaded(self):
+ return (hasattr(self,'fileids') and hasattr(self,'files') and
+ hasattr(self,'words'))
#
#$Log: not supported by cvs2svn $
-#Revision 1.1.2.3 2002/04/03 12:05:15 rochecompaan
-#Removed dos control characters.
+#Revision 1.11 2002/07/18 11:17:30 gmcm
+#Add Number and Boolean types to hyperdb.
+#Add conversion cases to web, mail & admin interfaces.
+#Add storage/serialization cases to back_anydbm & back_metakit.
+#
+#Revision 1.10 2002/07/14 23:17:24 richard
+#oops
+#
+#Revision 1.9 2002/07/14 06:11:16 richard
+#Some TODOs
+#
+#Revision 1.8 2002/07/09 21:53:38 gmcm
+#Optimize Class.find so that the propspec can contain a set of ids to match.
+#This is used by indexer.search so it can do just one find for all the index matches.
+#This was already confusing code, but for common terms (lots of index matches),
+#it is enormously faster.
+#
+#Revision 1.7 2002/07/09 21:38:43 richard
+#Only save the index if the thing is loaded and changed. Also, don't load
+#the index just for a save.
+#
+#Revision 1.6 2002/07/09 04:26:44 richard
+#We're indexing numbers now, and _underscore words
+#
+#Revision 1.5 2002/07/09 04:19:09 richard
+#Added reindex command to roundup-admin.
+#Fixed reindex on first access.
+#Also fixed reindexing of entries that change.
+#
+#Revision 1.4 2002/07/09 03:02:52 richard
+#More indexer work:
+#- all String properties may now be indexed too. Currently there's a bit of
+# "issue" specific code in the actual searching which needs to be
+# addressed. In a nutshell:
+# + pass 'indexme="yes"' as a String() property initialisation arg, eg:
+# file = FileClass(db, "file", name=String(), type=String(),
+# comment=String(indexme="yes"))
+# + the comment will then be indexed and be searchable, with the results
+# related back to the issue that the file is linked to
+#- as a result of this work, the FileClass has a default MIME type that may
+# be overridden in a subclass, or by the use of a "type" property as is
+# done in the default templates.
+#- the regeneration of the indexes (if necessary) is done once the schema is
+# set up in the dbinit.
+#
+#Revision 1.3 2002/07/08 06:58:15 richard
+#cleaned up the indexer code:
+# - it splits more words out (much simpler, faster splitter)
+# - removed code we'll never use (roundup.roundup_indexer has the full
+# implementation, and replaces roundup.indexer)
+# - only index text/plain and rfc822/message (ideas for other text formats to
+# index are welcome)
+# - added simple unit test for indexer. Needs more tests for regression.
+#
+#Revision 1.2 2002/05/25 07:16:24 rochecompaan
+#Merged search_indexing-branch with HEAD
+#
+#Revision 1.1.2.3 2002/05/02 11:52:12 rochecompaan
+#Fixed small bug that prevented indexes from being generated.
+#
+#Revision 1.1.2.2 2002/04/19 19:54:42 rochecompaan
+#cgi_client.py
+# removed search link for the time being
+# moved rendering of matches to htmltemplate
+#hyperdb.py
+# filtering of nodes on full text search incorporated in filter method
+#roundupdb.py
+# added paramater to call of filter method
+#roundup_indexer.py
+# added search method to RoundupIndexer class
#
-#Revision 1.1.2.2 2002/04/03 12:01:55 rochecompaan
-#Oops. Forgot to include cvs keywords in file.
+#Revision 1.1.2.1 2002/04/03 11:55:57 rochecompaan
+# . Added feature #526730 - search for messages capability
#