From: richard Date: Mon, 8 Jul 2002 06:58:15 +0000 (+0000) Subject: cleaned up the indexer code: X-Git-Url: https://git.tokkee.org/?a=commitdiff_plain;h=6ed6a9519397504546d46411b3bfca7d196370b1;p=roundup.git cleaned up the indexer code: - it splits more words out (much simpler, faster splitter) - removed code we'll never use (roundup.roundup_indexer has the full implementation, and replaces roundup.indexer) - only index text/plain and message/rfc822 (ideas for other text formats to index are welcome) - added a simple unit test for the indexer; more regression tests are needed. git-svn-id: http://svn.roundup-tracker.org/svnroot/roundup/trunk@832 57a73879-2fb5-44c3-a270-3262357dd7e2 --- diff --git a/CHANGES.txt b/CHANGES.txt index a2621e1..7b602e3 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,10 +1,18 @@ This file contains the changes to the Roundup system over time. The entries are given with the most recent entry first. -2002-??-?? 0.?.? +2002-??-?? 0.5.0 Fixed: . #576086 ] dumb copying mistake (frontends/ZRoundup.py) . installation instructions now mention "python2" in "testing your python". + . cleaned up the indexer code: + - it splits more words out + - removed code we'll never use (roundup.roundup_indexer has the full + implementation, and replaces roundup.indexer) + - only index text/plain and message/rfc822 (ideas for other text formats to + index are welcome) + - added a simple unit test for the indexer; more regression tests are needed. + . made the unit tests run again - they were quite b0rken 2002-06-24 0.4.2 diff --git a/roundup/backends/back_anydbm.py b/roundup/backends/back_anydbm.py index 46f1df6..5fb8b65 100644 --- a/roundup/backends/back_anydbm.py +++ b/roundup/backends/back_anydbm.py @@ -15,7 +15,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -#$Id: back_anydbm.py,v 1.37 2002-06-20 23:52:35 richard Exp $ +#$Id: back_anydbm.py,v 1.38 2002-07-08 06:58:15 richard Exp $ ''' This module defines a backend that saves the hyperdatabase in a database chosen by anydbm. It is guaranteed to always be available in python @@ -26,7 +26,7 @@ serious bugs, and is not available) import whichdb, anydbm, os, marshal from roundup import hyperdb, date from blobfiles import FileStorage -from roundup.roundup_indexer import RoundupIndexer +from roundup.indexer import Indexer from locking import acquire_lock, release_lock # @@ -62,7 +62,7 @@ class Database(FileStorage, hyperdb.Database): self.dirtynodes = {} # keep track of the dirty nodes by class self.newnodes = {} # keep track of the new nodes by class self.transactions = [] - self.indexer = RoundupIndexer(self.dir) + self.indexer = Indexer(self.dir) # ensure files are group readable and writable os.umask(0002) @@ -160,7 +160,7 @@ class Database(FileStorage, hyperdb.Database): except ImportError: raise hyperdb.DatabaseError, \ "Couldn't open database - the required module '%s'"\ - "is not available"%db_type + " is not available"%db_type if __debug__: print >>hyperdb.DEBUG, "_opendb %r.open(%r, %r)"%(db_type, path, mode) @@ -486,6 +486,9 @@ class Database(FileStorage, hyperdb.Database): # #$Log: not supported by cvs2svn $ +#Revision 1.37 2002/06/20 23:52:35 richard +#More informative error message +# #Revision 1.36 2002/06/19 03:07:19 richard #Moved the file storage commit into blobfiles where it belongs.
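The commit-message bullets above refer to the new regular-expression based word splitter and the MIME-type filter added to roundup/indexer.py further down in this diff. A minimal sketch of the splitting behaviour, assuming the same re.findall() pattern the new text_splitter() uses (the split_words wrapper and the sample sentence are illustrative only):

    # Sketch of the new splitter: fold case unless case-sensitive indexing
    # is requested, then keep runs of 2 to 25 word characters.  Longer runs
    # are dropped as probable gibberish (encoded attachments and the like),
    # and single characters are dropped as noise.
    import re

    def split_words(text, casesensitive=0):
        if not casesensitive:
            text = text.upper()
        return re.findall(r'\b\w{2,25}\b', text)

    print(split_words('Roundup indexes text/plain and message/rfc822 content'))
    # ['ROUNDUP', 'INDEXES', 'TEXT', 'PLAIN', 'AND', 'MESSAGE', 'RFC822', 'CONTENT']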
# diff --git a/roundup/backends/blobfiles.py b/roundup/backends/blobfiles.py index 3f110e5..86ff228 100644 --- a/roundup/backends/blobfiles.py +++ b/roundup/backends/blobfiles.py @@ -15,13 +15,13 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -#$Id: blobfiles.py,v 1.4 2002-06-19 03:07:19 richard Exp $ +#$Id: blobfiles.py,v 1.5 2002-07-08 06:58:15 richard Exp $ ''' This module exports file storage for roundup backends. Files are stored into a directory hierarchy. ''' -import os, os.path +import os def files_in_dir(dir): if not os.path.exists(dir): @@ -106,11 +106,13 @@ class FileStorage: ''' # the file is currently ".tmp" - move it to its real name to commit os.rename(name+".tmp", name) - pattern = name.split('/')[-1] - self.indexer.add_files(dir=os.path.dirname(name), pattern=pattern) + self.indexer.add_file(name) self.indexer.save_index() # $Log: not supported by cvs2svn $ +# Revision 1.4 2002/06/19 03:07:19 richard +# Moved the file storage commit into blobfiles where it belongs. +# # Revision 1.3 2002/02/27 07:33:34 grubert # . add, vim line and cvs log key. # diff --git a/roundup/indexer.py b/roundup/indexer.py index 47f0120..8b2f615 100644 --- a/roundup/indexer.py +++ b/roundup/indexer.py @@ -1,390 +1,105 @@ -#!/usr/bin/env python - -"""Create full-text indexes and search them - -Notes: - - See http://gnosis.cx/publish/programming/charming_python_15.txt - for a detailed discussion of this module. - - This version requires Python 1.6+. It turns out that the use - of string methods rather than [string] module functions is - enough faster in a tight loop so as to provide a quite - remarkable 25% speedup in overall indexing. However, only FOUR - lines in TextSplitter.text_splitter() were changed away from - Python 1.5 compatibility. Those lines are followed by comments - beginning with "# 1.52: " that show the old forms. Python - 1.5 users can restore these lines, and comment out those just - above them. - -Classes: - - GenericIndexer -- Abstract class - TextSplitter -- Mixin class - Index - ShelveIndexer - FlatIndexer - XMLPickleIndexer - PickleIndexer - ZPickleIndexer - SlicedZPickleIndexer - -Functions: - - echo_fname(fname) - recurse_files(...) - -Index Formats: - - *Indexer.files: filename --> (fileid, wordcount) - *Indexer.fileids: fileid --> filename - *Indexer.words: word --> {fileid1:occurs, fileid2:occurs, ...} - -Module Usage: - - There are a few ways to use this module. Just to utilize existing - functionality, something like the following is a likely - pattern: - - import gnosis.indexer as indexer - index = indexer.MyFavoriteIndexer() # For some concrete Indexer - index.load_index('myIndex.db') - index.add_files(dir='/this/that/otherdir', pattern='*.txt') - hits = index.find(['spam','eggs','bacon']) - index.print_report(hits) - - To customize the basic classes, something like the following is likely: - - class MySplitter: - def splitter(self, text, ftype): - "Peform much better splitting than default (for filetypes)" - # ... - return words - - class MyIndexer(indexer.GenericIndexer, MySplitter): - def load_index(self, INDEXDB=None): - "Retrieve three dictionaries from clever storage method" - # ... - self.words, self.files, self.fileids = WORDS, FILES, FILEIDS - def save_index(self, INDEXDB=None): - "Save three dictionaries to clever storage method" - - index = MyIndexer() - # ...etc... - -Benchmarks: - - As we know, there are lies, damn lies, and benchmarks. 
Take - the below with an adequate dose of salt. In version 0.10 of - the concrete indexers, some performance was tested. The - test case was a set of mail/news archives, that were about - 43 mB, and 225 files. In each case, an index was generated - (if possible), and a search for the words "xml python" was - performed. - - - Index w/ PickleIndexer: 482s, 2.4 mB - - Search w/ PickleIndexer: 1.74s - - Index w/ ZPickleIndexer: 484s, 1.2 mB - - Search w/ ZPickleIndexer: 1.77s - - Index w/ FlatIndexer: 492s, 2.6 mB - - Search w/ FlatIndexer: 53s - - Index w/ ShelveIndexer: (dumbdbm) Many minutes, tens of mBs - - Search w/ ShelveIndexer: Aborted before completely indexed - - Index w/ ShelveIndexer: (dbhash) Long time (partial crash), 10 mB - - Search w/ ShelveIndexer: N/A. Too many glitches - - Index w/ XMLPickleIndexer: Memory error (xml_pickle uses bad string - composition for large output) - - Search w/ XMLPickleIndexer: N/A - - grep search (xml|python): 20s (cached: <5s) - - 'srch' utility (python): 12s -""" -#$Id: indexer.py,v 1.2 2002-05-25 07:16:24 rochecompaan Exp $ - -__shell_usage__ = """ -Shell Usage: [python] indexer.py [options] [search_words] - - -h, /h, -?, /?, ?, --help: Show this help screen - -index: Add files to index - -reindex: Refresh files already in the index - (can take much more time) - -casesensitive: Maintain the case of indexed words - (can lead to MUCH larger indices) - -norecurse, -local: Only index starting dir, not subdirs - -dir=: Starting directory for indexing - (default is current directory) - -indexdb=: Use specified index database - (environ variable INDEXER_DB is preferred) - -regex=: Index files matching regular expression - -glob=: Index files matching glob pattern - -filter= Only display results matching pattern - -output=, -format=: How much detail on matches? - -: Quiet level (0=verbose ... 9=quiet) - -Output/format options are ALL/EVERYTHING/VERBOSE, RATINGS/SCORES, -FILENAMES/NAMES/FILES, SUMMARY/REPORT""" - -__version__ = "$Revision: 1.2 $" -__author__=["David Mertz (mertz@gnosis.cx)",] -__thanks_to__=["Pat Knight (p.knight@ktgroup.co.uk)", - "Gregory Popovitch (greg@gpy.com)", ] -__copyright__=""" - This file is released to the public domain. I (dqm) would - appreciate it if you choose to keep derived works under terms - that promote freedom, but obviously am giving up any rights - to compel such. -""" - -__history__=""" - 0.1 Initial version. - - 0.11 Tweaked TextSplitter after some random experimentation. - - 0.12 Added SlicedZPickleIndexer (best choice, so far). - - 0.13 Pat Knight pointed out need for binary open()'s of - certain files under Windows. - - 0.14 Added '-filter' switch to search results. - - 0.15 Added direct read of gzip files - - 0.20 Gregory Popovitch did some profiling on TextSplitter, - and provided both huge speedups to the Python version - and hooks to a C extension class (ZopeTextSplitter). - A little refactoring by he and I (dqm) has nearly - doubled the speed of indexing - - 0.30 Module refactored into gnosis package. This is a - first pass, and various documentation and test cases - should be added later. 
-""" -import string, re, os, fnmatch, sys, copy, gzip -from types import * - -#-- Silly "do nothing" default recursive file processor -def echo_fname(fname): print fname - -#-- "Recurse and process files" utility function -def recurse_files(curdir, pattern, exclusions, func=echo_fname, *args, **kw): - "Recursively process file pattern" - subdirs, files = [],[] - level = kw.get('level',0) - - for name in os.listdir(curdir): - fname = os.path.join(curdir, name) - if name[-4:] in exclusions: - pass # do not include binary file type - elif os.path.isdir(fname) and not os.path.islink(fname): - subdirs.append(fname) - # kludge to detect a regular expression across python versions - elif sys.version[0]=='1' and isinstance(pattern, re.RegexObject): - if pattern.match(name): - files.append(fname) - elif sys.version[0]=='2' and type(pattern)==type(re.compile('')): - if pattern.match(name): - files.append(fname) - elif type(pattern) is StringType: - if fnmatch.fnmatch(name, pattern): - files.append(fname) - - for fname in files: - apply(func, (fname,)+args) - for subdir in subdirs: - recurse_files(subdir, pattern, exclusions, func, level=level+1) - -#-- Data bundle for index dictionaries -class Index: - def __init__(self, words, files, fileids): - if words is not None: self.WORDS = words - if files is not None: self.FILES = files - if fileids is not None: self.FILEIDS = fileids - -#-- "Split plain text into words" utility function -class TextSplitter: - def initSplitter(self): - prenum = string.join(map(chr, range(0,48)), '') - num2cap = string.join(map(chr, range(58,65)), '') - cap2low = string.join(map(chr, range(91,97)), '') - postlow = string.join(map(chr, range(123,256)), '') - nonword = prenum + num2cap + cap2low + postlow - self.word_only = string.maketrans(nonword, " "*len(nonword)) - self.nondigits = string.join(map(chr, range(0,48)) + map(chr, range(58,255)), '') - self.alpha = string.join(map(chr, range(65,91)) + map(chr, range(97,123)), '') - self.ident = string.join(map(chr, range(256)), '') - self.init = 1 - - def splitter(self, text, ftype): - "Split the contents of a text string into a list of 'words'" - if ftype == 'text/plain': - words = self.text_splitter(text, self.casesensitive) +# +# This module is derived from the module described at: +# http://gnosis.cx/publish/programming/charming_python_15.txt +# +# Author: David Mertz (mertz@gnosis.cx) +# Thanks to: Pat Knight (p.knight@ktgroup.co.uk) +# Gregory Popovitch (greg@gpy.com) +# +# The original module was released under this license, and remains under +# it: +# +# This file is released to the public domain. I (dqm) would +# appreciate it if you choose to keep derived works under terms +# that promote freedom, but obviously am giving up any rights +# to compel such. +# +#$Id: indexer.py,v 1.3 2002-07-08 06:58:15 richard Exp $ +''' +This module provides an indexer class, Indexer, that stores text +indices in a roundup instance. This class makes searching the content of +messages and text files possible. +''' +import os, shutil, re, mimetypes, marshal, zlib, errno + +class Indexer: + ''' Indexes messages and files. + + This implements a new splitter based on re.findall '\w+' and the + add_othertext method.
+ ''' + def __init__(self, db_path): + indexdb_path = os.path.join(db_path, 'indexes') + + # see if we need to reindex because of a change in code + if (os.path.exists(indexdb_path) and + not os.path.exists(os.path.join(indexdb_path, 'version'))): + shutil.rmtree(indexdb_path) + + # see if the index exists + index_exists = 0 + if not os.path.exists(indexdb_path): + os.makedirs(indexdb_path) + os.chmod(indexdb_path, 0775) + open(os.path.join(indexdb_path, 'version'), 'w').write('1\n') else: - raise NotImplementedError - return words - - def text_splitter(self, text, casesensitive=0): - """Split text/plain string into a list of words - - In version 0.20 this function is still fairly weak at - identifying "real" words, and excluding gibberish - strings. As long as the indexer looks at "real" text - files, it does pretty well; but if indexing of binary - data is attempted, a lot of gibberish gets indexed. - Suggestions on improving this are GREATLY APPRECIATED. - """ - # Initialize some constants - if not hasattr(self,'init'): self.initSplitter() - - # Speedup trick: attributes into local scope - word_only = self.word_only - ident = self.ident - alpha = self.alpha - nondigits = self.nondigits - translate = string.translate - - # Let's adjust case if not case-sensitive - if not casesensitive: text = string.upper(text) - - # Split the raw text - allwords = string.split(text) - - # Finally, let's skip some words not worth indexing - words = [] - for word in allwords: - if len(word) > 25: continue # too long (probably gibberish) - - # Identify common patterns in non-word data (binary, UU/MIME, etc) - num_nonalpha = len(word.translate(ident, alpha)) - numdigits = len(word.translate(ident, nondigits)) - # 1.52: num_nonalpha = len(translate(word, ident, alpha)) - # 1.52: numdigits = len(translate(word, ident, nondigits)) - if numdigits > len(word)-2: # almost all digits - if numdigits > 5: # too many digits is gibberish - continue # a moderate number is year/zipcode/etc - elif num_nonalpha*3 > len(word): # too much scattered nonalpha = gibberish - continue - - word = word.translate(word_only) # Let's strip funny byte values - # 1.52: word = translate(word, word_only) - subwords = word.split() # maybe embedded non-alphanumeric - # 1.52: subwords = string.split(word) - for subword in subwords: # ...so we might have subwords - if len(subword) <= 2: continue # too short a subword - words.append(subword) - return words - -class ZopeTextSplitter: - def initSplitter(self): - import Splitter - stop_words=( - 'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across', - 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', - 'along', 'already', 'also', 'although', 'always', 'am', 'among', - 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', - 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', - 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', - 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', - 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', - 'bottom', 'but', 'by', 'can', 'cannot', 'cant', 'con', 'could', - 'couldnt', 'cry', 'describe', 'detail', 'do', 'done', 'down', 'due', - 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', - 'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone', - 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', - 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', - 'forty', 'found', 'four', 'from', 'front', 'full', 
'further', 'get', - 'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her', - 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', - 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', - 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it', - 'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least', - 'less', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill', - 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', - 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless', - 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', - 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once', - 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', - 'ours', 'ourselves', 'out', 'over', 'own', 'per', 'perhaps', - 'please', 'pre', 'put', 'rather', 're', 'same', 'see', 'seem', - 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should', - 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', - 'somehow', 'someone', 'something', 'sometime', 'sometimes', - 'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the', - 'their', 'them', 'themselves', 'then', 'thence', 'there', - 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', - 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three', - 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', - 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under', - 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well', - 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', - 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', - 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', - 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', - 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', - ) - self.stop_word_dict={} - for word in stop_words: self.stop_word_dict[word]=None - self.splitterobj = Splitter.getSplitter() - self.init = 1 - - def goodword(self, word): - return len(word) < 25 - - def splitter(self, text, ftype): - """never case-sensitive""" - if not hasattr(self,'init'): self.initSplitter() - return filter(self.goodword, self.splitterobj(text, self.stop_word_dict)) - - -#-- "Abstract" parent class for inherited indexers -# (does not handle storage in parent, other methods are primitive) - -class GenericIndexer: - def __init__(self, **kw): - apply(self.configure, (), kw) - - def whoami(self): - return self.__class__.__name__ - - def configure(self, REINDEX=0, CASESENSITIVE=0, - INDEXDB=os.environ.get('INDEXER_DB', 'TEMP_NDX.DB'), - ADD_PATTERN='*', QUIET=5): - "Configure settings used by indexing and storage/retrieval" - self.indexdb = INDEXDB - self.reindex = REINDEX - self.casesensitive = CASESENSITIVE - self.add_pattern = ADD_PATTERN - self.quiet = QUIET - self.filter = None - - def add_files(self, dir=os.getcwd(), pattern=None, descend=1): - self.load_index() - exclusions = ('.zip','.pyc','.gif','.jpg','.dat','.dir') - if not pattern: - pattern = self.add_pattern - recurse_files(dir, pattern, exclusions, self.add_file) + index_exists = 1 + + # save off the path to the indexdb + self.indexdb = os.path.join(indexdb_path, 'index.db') + self.reindex = 0 + self.casesensitive = 0 + self.quiet = 9 + + if not index_exists: + # index everything + files_path = os.path.join(db_path, 'files') + self.add_files(dir=files_path) + self.save_index() + + # override add_files so it's a little 
smarter about file types + def add_files(self, dir): + if not hasattr(self, 'files'): + self.load_index() + os.path.walk(dir, self.walk_add_file, None) # Rebuild the fileid index self.fileids = {} for fname in self.files.keys(): fileid = self.files[fname][0] self.fileids[fileid] = fname - def add_file(self, fname, ftype='text/plain'): - "Index the contents of a regular file" - if self.files.has_key(fname): # Is file eligible for (re)indexing? - if self.reindex: # Reindexing enabled, cleanup dicts + # override add_file so it can be a little smarter about determining the + # file type + def walk_add_file(self, arg, dname, names, ftype=None): + for name in names: + name = os.path.join(dname, name) + if os.path.isfile(name): + self.add_file(name) + elif os.path.isdir(name): + os.path.walk(name, self.walk_add_file, None) + def add_file(self, fname, ftype=None): + ''' Index the contents of a regular file + ''' + if not hasattr(self, 'files'): + self.load_index() + # Is file eligible for (re)indexing? + if self.files.has_key(fname): + if self.reindex: + # Reindexing enabled, cleanup dicts self.purge_entry(fname, self.files, self.words) - else: # DO NOT reindex this file - if self.quiet < 5: print "Skipping", fname + else: + # DO NOT reindex this file + if self.quiet < 5: + print "Skipping", fname return 0 - # Read in the file (if possible) - try: - if fname[-3:] == '.gz': - text = gzip.open(fname).read() - else: - text = open(fname).read() - if self.quiet < 5: print "Indexing", fname - except IOError: - return 0 + # guess the file type + if ftype is None: + ftype = mimetypes.guess_type(fname)[0] + + # read in the file + text = open(fname).read() + if self.quiet < 5: print "Indexing", fname words = self.splitter(text, ftype) # Find new file index, and assign it to filename @@ -408,383 +123,254 @@ class GenericIndexer: entry[file_index] = filedict[word] self.words[word] = entry - def add_othertext(self, identifier): - """Index a textual source other than a plain file + # NOTE: this method signature deviates from the one specified in + # indexer - I'm not entirely sure where it was expected to get the text + # from otherwise... + def add_othertext(self, identifier, text): + ''' Add some text associated with the identifier + ''' + # Is file eligible for (re)indexing? + if self.files.has_key(identifier): + # Reindexing enabled, cleanup dicts + if self.reindex: + self.purge_entry(identifier, self.files, self.words) + else: + # DO NOT reindex this file + if self.quiet < 5: + print "Not reindexing", identifier + return 0 - A child class might want to implement this method (or a similar one) - in order to index textual sources such as SQL tables, URLs, clay - tablets, or whatever else.
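add_file() above now guesses the content type from the filename, and splitter() (further down) only produces words for text/plain and message/rfc822 content; anything else yields an empty word list and is effectively skipped. A small illustration with the standard mimetypes module (the file names are made up):

    # Only names that guess to an indexable type contribute words to the
    # index; binary types and extension-less names do not.
    import mimetypes

    print(mimetypes.guess_type('notes.txt')[0])   # 'text/plain' -> indexed
    print(mimetypes.guess_type('photo.jpg')[0])   # 'image/jpeg' -> ignored by splitter()
    print(mimetypes.guess_type('msg23')[0])       # None         -> ignored by splitter()

Message bodies themselves reach the index through add_othertext() below, which always hands its text to the splitter as text/plain.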
The identifier should uniquely pick out - the source of the text (whatever it is) - """ - raise NotImplementedError + # split into words + words = self.splitter(text, 'text/plain') - def save_index(self, INDEXDB=None): - raise NotImplementedError + # Find new file index, and assign it to identifier + # (_TOP uses trick of negative to avoid conflict with file index) + self.files['_TOP'] = (self.files['_TOP'][0]-1, None) + file_index = abs(self.files['_TOP'][0]) + self.files[identifier] = (file_index, len(words)) + self.fileids[file_index] = identifier + + # find the unique words + filedict = {} + for word in words: + if filedict.has_key(word): + filedict[word] = filedict[word]+1 + else: + filedict[word] = 1 - def load_index(self, INDEXDB=None, reload=0, wordlist=None): - raise NotImplementedError + # now add to the totals + for word in filedict.keys(): + # each word has a dict of {identifier: count} + if self.words.has_key(word): + entry = self.words[word] + else: + # new word + entry = {} + self.words[word] = entry - def find(self, wordlist, print_report=0): - "Locate files that match ALL the words in wordlist" + # make a reference to the file for this word + entry[file_index] = filedict[word] + + def splitter(self, text, ftype): + ''' Split the contents of a text string into a list of 'words' + ''' + if ftype in ('text/plain', 'message/rfc822'): + words = self.text_splitter(text, self.casesensitive) + else: + return [] + return words + + def text_splitter(self, text, casesensitive=0): + """Split text/plain string into a list of words + """ + # Let's adjust case if not case-sensitive + if not casesensitive: + text = text.upper() + + # Split the raw text, losing anything longer than 25 characters + # since that'll be gibberish (encoded text or somesuch) or shorter + # than 3 characters since those short words appear all over the + # place + return re.findall(r'\b\w{2,25}\b', text) + + def search(self, search_terms, klass): + ''' display search results + ''' + hits = self.find(search_terms) + links = [] + nodeids = {} + designator_propname = {'msg': 'messages', 'file': 'files'} + if hits: + hitcount = len(hits) + # build a dictionary of nodes and their associated messages + # and files + for hit in hits.keys(): + filename = hits[hit].split('/')[-1] + for designator, propname in designator_propname.items(): + if not filename.startswith(designator): + continue + nodeid = filename[len(designator):] + result = apply(klass.find, (), {propname:nodeid}) + if not result: + continue + + id = str(result[0]) + if not nodeids.has_key(id): + nodeids[id] = {} + + node_dict = nodeids[id] + if not node_dict.has_key(propname): + node_dict[propname] = [nodeid] + elif node_dict.has_key(propname): + node_dict[propname].append(nodeid) + + return nodeids + + # we override this to ignore not 2 < word < 25 and also to fix a bug - + # the (fail) case. 
+ def find(self, wordlist): + ''' Locate files that match ALL the words in wordlist + ''' + if not hasattr(self, 'words'): + self.load_index() self.load_index(wordlist=wordlist) entries = {} - hits = copy.copy(self.fileids) # Copy of fileids index + hits = None for word in wordlist: + if not 2 < len(word) < 25: + # word outside the bounds of what we index - ignore + continue if not self.casesensitive: - word = string.upper(word) + word = word.upper() entry = self.words.get(word) # For each word, get index entries[word] = entry # of matching files if not entry: # Nothing for this one word (fail) - return 0 - for fileid in hits.keys(): # Eliminate hits for every non-match - if not entry.has_key(fileid): - del hits[fileid] - if print_report: - self.print_report(hits, wordlist, entries) - return hits - - def print_report(self, hits={}, wordlist=[], entries={}): - # Figure out what to actually print (based on QUIET level) - output = [] - for fileid,fname in hits.items(): - message = fname - if self.quiet <= 3: - wordcount = self.files[fname][1] - matches = 0 - countmess = '\n'+' '*13+`wordcount`+' words; ' - for word in wordlist: - if not self.casesensitive: - word = string.upper(word) - occurs = entries[word][fileid] - matches = matches+occurs - countmess = countmess +`occurs`+' '+word+'; ' - message = string.ljust('[RATING: ' - +`1000*matches/wordcount`+']',13)+message - if self.quiet <= 2: message = message +countmess +'\n' - if self.filter: # Using an output filter - if fnmatch.fnmatch(message, self.filter): - output.append(message) + return {} + if hits is None: + hits = {} + for k in entry.keys(): + hits[k] = self.fileids[k] else: - output.append(message) - - if self.quiet <= 5: - print string.join(output,'\n') - sys.stderr.write('\n'+`len(output)`+' files matched wordlist: '+ - `wordlist`+'\n') - return output - - def purge_entry(self, fname, file_dct, word_dct): - "Remove a file from file index and word index" - try: # The easy part, cleanup the file index - file_index = file_dct[fname] - del file_dct[fname] - except KeyError: - pass # We'll assume we only encounter KeyError's - # The much harder part, cleanup the word index - for word, occurs in word_dct.items(): - if occurs.has_key(file_index): - del occurs[file_index] - word_dct[word] = occurs + # Eliminate hits for every non-match + for fileid in hits.keys(): + if not entry.has_key(fileid): + del hits[fileid] + if hits is None: + return {} + return hits - def index_loaded(self): - return ( hasattr(self,'fileids') and - hasattr(self,'files') and - hasattr(self,'words') ) - -#-- Provide an actual storage facility for the indexes (i.e. shelve) -class ShelveIndexer(GenericIndexer, TextSplitter): - """Concrete Indexer utilizing [shelve] for storage - - Unfortunately, [shelve] proves far too slow in indexing, while - creating monstrously large indexes. Not recommend, at least under - the default dbm's tested. Also, class may be broken because - shelves do not, apparently, support the .values() and .items() - methods. Fixing this is a low priority, but the sample code is - left here. 
- """ - def load_index(self, INDEXDB=None, reload=0, wordlist=None): - INDEXDB = INDEXDB or self.indexdb - import shelve - self.words = shelve.open(INDEXDB+".WORDS") - self.files = shelve.open(INDEXDB+".FILES") - self.fileids = shelve.open(INDEXDB+".FILEIDS") - if not FILES: # New index - self.files['_TOP'] = (0,None) - - def save_index(self, INDEXDB=None): - INDEXDB = INDEXDB or self.indexdb - pass - -class FlatIndexer(GenericIndexer, TextSplitter): - """Concrete Indexer utilizing flat-file for storage - - See the comments in the referenced article for details; in - brief, this indexer has about the same timing as the best in - -creating- indexes and the storage requirements are - reasonable. However, actually -using- a flat-file index is - more than an order of magnitude worse than the best indexer - (ZPickleIndexer wins overall). - - On the other hand, FlatIndexer creates a wonderfully easy to - parse database format if you have a reason to transport the - index to a different platform or programming language. And - should you perform indexing as part of a long-running - process, the overhead of initial file parsing becomes - irrelevant. - """ - def load_index(self, INDEXDB=None, reload=0, wordlist=None): - # Unless reload is indicated, do not load twice - if self.index_loaded() and not reload: return 0 - # Ok, now let's actually load it - INDEXDB = INDEXDB or self.indexdb - self.words = {} - self.files = {'_TOP':(0,None)} - self.fileids = {} - try: # Read index contents - for line in open(INDEXDB).readlines(): - fields = string.split(line) - if fields[0] == '-': # Read a file/fileid line - fileid = eval(fields[2]) - wordcount = eval(fields[3]) - fname = fields[1] - self.files[fname] = (fileid, wordcount) - self.fileids[fileid] = fname - else: # Read a word entry (dict of hits) - entries = {} - word = fields[0] - for n in range(1,len(fields),2): - fileid = eval(fields[n]) - occurs = eval(fields[n+1]) - entries[fileid] = occurs - self.words[word] = entries - except: - pass # New index - - def save_index(self, INDEXDB=None): - INDEXDB = INDEXDB or self.indexdb - tab, lf, sp = '\t','\n',' ' - indexdb = open(INDEXDB,'w') - for fname,entry in self.files.items(): - indexdb.write('- '+fname +tab +`entry[0]` +tab +`entry[1]` +lf) - for word,entry in self.words.items(): - indexdb.write(word +tab+tab) - for fileid,occurs in entry.items(): - indexdb.write(`fileid` +sp +`occurs` +sp) - indexdb.write(lf) - -class PickleIndexer(GenericIndexer, TextSplitter): - def load_index(self, INDEXDB=None, reload=0, wordlist=None): - # Unless reload is indicated, do not load twice - if self.index_loaded() and not reload: return 0 - # Ok, now let's actually load it - import cPickle - INDEXDB = INDEXDB or self.indexdb - try: - pickle_str = open(INDEXDB,'rb').read() - db = cPickle.loads(pickle_str) - except: # New index - db = Index({}, {'_TOP':(0,None)}, {}) - self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS - - def save_index(self, INDEXDB=None): - import cPickle - INDEXDB = INDEXDB or self.indexdb - db = Index(self.words, self.files, self.fileids) - open(INDEXDB,'wb').write(cPickle.dumps(db, 1)) - -class XMLPickleIndexer(PickleIndexer): - """Concrete Indexer utilizing XML for storage - - While this is, as expected, a verbose format, the possibility - of using XML as a transport format for indexes might be - useful. 
However, [xml_pickle] is in need of some redesign to - avoid gross inefficiency when creating very large - (multi-megabyte) output files (fixed in [xml_pickle] version - 0.48 or above) - """ - def load_index(self, INDEXDB=None, reload=0, wordlist=None): - # Unless reload is indicated, do not load twice - if self.index_loaded() and not reload: return 0 - # Ok, now let's actually load it - from gnosis.xml.pickle import XML_Pickler - INDEXDB = INDEXDB or self.indexdb - try: # XML file exists - xml_str = open(INDEXDB).read() - db = XML_Pickler().loads(xml_str) - except: # New index - db = Index({}, {'_TOP':(0,None)}, {}) - self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS - - def save_index(self, INDEXDB=None): - from gnosis.xml.pickle import XML_Pickler - INDEXDB = INDEXDB or self.indexdb - db = Index(self.words, self.files, self.fileids) - open(INDEXDB,'w').write(XML_Pickler(db).dumps()) - -class ZPickleIndexer(PickleIndexer): - def load_index(self, INDEXDB=None, reload=0, wordlist=None): - # Unless reload is indicated, do not load twice - if self.index_loaded() and not reload: return 0 - # Ok, now let's actually load it - import cPickle, zlib - INDEXDB = INDEXDB or self.indexdb - try: - pickle_str = zlib.decompress(open(INDEXDB+'!','rb').read()) - db = cPickle.loads(pickle_str) - except: # New index - db = Index({}, {'_TOP':(0,None)}, {}) - self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS - - def save_index(self, INDEXDB=None): - import cPickle, zlib - INDEXDB = INDEXDB or self.indexdb - db = Index(self.words, self.files, self.fileids) - pickle_fh = open(INDEXDB+'!','wb') - pickle_fh.write(zlib.compress(cPickle.dumps(db, 1))) - - -class SlicedZPickleIndexer(ZPickleIndexer): segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!" - def load_index(self, INDEXDB=None, reload=0, wordlist=None): + def load_index(self, reload=0, wordlist=None): # Unless reload is indicated, do not load twice - if self.index_loaded() and not reload: return 0 + if self.index_loaded() and not reload: + return 0 + # Ok, now let's actually load it - import cPickle, zlib - INDEXDB = INDEXDB or self.indexdb - db = Index({}, {'_TOP':(0,None)}, {}) + db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}} + # Identify the relevant word-dictionary segments if not wordlist: segments = self.segments else: segments = ['-','#'] for word in wordlist: - segments.append(string.upper(word[0])) + segments.append(word[0].upper()) + # Load the segments for segment in segments: try: - pickle_str = zlib.decompress(open(INDEXDB+segment,'rb').read()) - dbslice = cPickle.loads(pickle_str) - if dbslice.__dict__.get('WORDS'): # If it has some words, add them - for word,entry in dbslice.WORDS.items(): - db.WORDS[word] = entry - if dbslice.__dict__.get('FILES'): # If it has some files, add them - db.FILES = dbslice.FILES - if dbslice.__dict__.get('FILEIDS'): # If it has fileids, add them - db.FILEIDS = dbslice.FILEIDS - except: - pass # No biggie, couldn't find this segment - self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS - - def julienne(self, INDEXDB=None): - import cPickle, zlib - INDEXDB = INDEXDB or self.indexdb - segments = self.segments # all the (little) indexes - for segment in segments: - try: # brutal space saver... 
delete all the small segments - os.remove(INDEXDB+segment) + f = open(self.indexdb + segment, 'rb') + except IOError, error: + if error.errno != errno.ENOENT: + raise + else: + pickle_str = zlib.decompress(f.read()) + f.close() + dbslice = marshal.loads(pickle_str) + if dbslice.get('WORDS'): + # if it has some words, add them + for word, entry in dbslice['WORDS'].items(): + db['WORDS'][word] = entry + if dbslice.get('FILES'): + # if it has some files, add them + db['FILES'] = dbslice['FILES'] + if dbslice.get('FILEIDS'): + # if it has fileids, add them + db['FILEIDS'] = dbslice['FILEIDS'] + + self.words = db['WORDS'] + self.files = db['FILES'] + self.fileids = db['FILEIDS'] + + def save_index(self): + # brutal space saver... delete all the small segments + for segment in self.segments: + try: + os.remove(self.indexdb + segment) except OSError: - pass # probably just nonexistent segment index file + # probably just nonexistent segment index file + # TODO: make sure it's an EEXIST + pass + # First write the much simpler filename/fileid dictionaries - dbfil = Index(None, self.files, self.fileids) - open(INDEXDB+'-','wb').write(zlib.compress(cPickle.dumps(dbfil,1))) + dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids} + open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil))) + # The hard part is splitting the word dictionary up, of course - letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#" segdicts = {} # Need batch of empty dicts - for segment in letters+'#': + for segment in letters: segdicts[segment] = {} for word, entry in self.words.items(): # Split into segment dicts - initchar = string.upper(word[0]) - if initchar in letters: - segdicts[initchar][word] = entry - else: - segdicts['#'][word] = entry - for initchar in letters+'#': - db = Index(segdicts[initchar], None, None) - pickle_str = cPickle.dumps(db, 1) - filename = INDEXDB+initchar - pickle_fh = open(filename,'wb') + initchar = word[0].upper() + segdicts[initchar][word] = entry + + # save + for initchar in letters: + db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None} + pickle_str = marshal.dumps(db) + filename = self.indexdb + initchar + pickle_fh = open(filename, 'wb') pickle_fh.write(zlib.compress(pickle_str)) - os.chmod(filename,0664) - - save_index = julienne - -PreferredIndexer = SlicedZPickleIndexer - -#-- If called from command-line, parse arguments and take actions -if __name__ == '__main__': - import time - start = time.time() - search_words = [] # Word search list (if specified) - opts = 0 # Any options specified? - if len(sys.argv) < 2: - pass # No options given - else: - upper = string.upper - dir = os.getcwd() # Default to indexing from current directory - descend = 1 # Default to recursive indexing - ndx = PreferredIndexer() - for opt in sys.argv[1:]: - if opt in ('-h','/h','-?','/?','?','--help'): # help screen - print __shell_usage__ - opts = -1 - break - elif opt[0] in '/-': # a switch! 
- opts = opts+1 - if upper(opt[1:]) == 'INDEX': # Index files - ndx.quiet = 0 - pass # Use defaults if no other options - elif upper(opt[1:]) == 'REINDEX': # Reindex - ndx.reindex = 1 - elif upper(opt[1:]) == 'CASESENSITIVE': # Case sensitive - ndx.casesensitive = 1 - elif upper(opt[1:]) in ('NORECURSE','LOCAL'): # No recursion - descend = 0 - elif upper(opt[1:4]) == 'DIR': # Dir to index - dir = opt[5:] - elif upper(opt[1:8]) == 'INDEXDB': # Index specified - ndx.indexdb = opt[9:] - sys.stderr.write( - "Use of INDEXER_DB environment variable is STRONGLY recommended.\n") - elif upper(opt[1:6]) == 'REGEX': # RegEx files to index - ndx.add_pattern = re.compile(opt[7:]) - elif upper(opt[1:5]) == 'GLOB': # Glob files to index - ndx.add_pattern = opt[6:] - elif upper(opt[1:7]) in ('OUTPUT','FORMAT'): # How should results look? - opts = opts-1 # this is not an option for indexing purposes - level = upper(opt[8:]) - if level in ('ALL','EVERYTHING','VERBOSE', 'MAX'): - ndx.quiet = 0 - elif level in ('RATINGS','SCORES','HIGH'): - ndx.quiet = 3 - elif level in ('FILENAMES','NAMES','FILES','MID'): - ndx.quiet = 5 - elif level in ('SUMMARY','MIN'): - ndx.quiet = 9 - elif upper(opt[1:7]) == 'FILTER': # Regex filter output - opts = opts-1 # this is not an option for indexing purposes - ndx.filter = opt[8:] - elif opt[1:] in string.digits: - opts = opts-1 - ndx.quiet = eval(opt[1]) - else: - search_words.append(opt) # Search words - - if opts > 0: - ndx.add_files(dir=dir) - ndx.save_index() - if search_words: - ndx.find(search_words, print_report=1) - if not opts and not search_words: - sys.stderr.write("Perhaps you would like to use the --help option?\n") - else: - sys.stderr.write('Processed in %.3f seconds (%s)' - % (time.time()-start, ndx.whoami())) + os.chmod(filename, 0664) + + def purge_entry(self, fname, file_dct, word_dct): + ''' Remove a file from file index and word index + ''' + try: # The easy part, cleanup the file index + file_index = file_dct[fname] + del file_dct[fname] + except KeyError: + pass # We'll assume we only encounter KeyError's + # The much harder part, cleanup the word index + for word, occurs in word_dct.items(): + if occurs.has_key(file_index): + del occurs[file_index] + word_dct[word] = occurs + + def index_loaded(self): + return (hasattr(self,'fileids') and hasattr(self,'files') and + hasattr(self,'words')) # #$Log: not supported by cvs2svn $ -#Revision 1.1.2.3 2002/04/03 12:05:15 rochecompaan -#Removed dos control characters. +#Revision 1.2 2002/05/25 07:16:24 rochecompaan +#Merged search_indexing-branch with HEAD +# +#Revision 1.1.2.3 2002/05/02 11:52:12 rochecompaan +#Fixed small bug that prevented indexes from being generated. +# +#Revision 1.1.2.2 2002/04/19 19:54:42 rochecompaan +#cgi_client.py +# removed search link for the time being +# moved rendering of matches to htmltemplate +#hyperdb.py +# filtering of nodes on full text search incorporated in filter method +#roundupdb.py +# added paramater to call of filter method +#roundup_indexer.py +# added search method to RoundupIndexer class # -#Revision 1.1.2.2 2002/04/03 12:01:55 rochecompaan -#Oops. Forgot to include cvs keywords in file. +#Revision 1.1.2.1 2002/04/03 11:55:57 rochecompaan +# . 
Added feature #526730 - search for messages capability # diff --git a/roundup/roundup_indexer.py b/roundup/roundup_indexer.py deleted file mode 100644 index a66261c..0000000 --- a/roundup/roundup_indexer.py +++ /dev/null @@ -1,98 +0,0 @@ -# -# Copyright (c) 2001 Bizar Software Pty Ltd (http://www.bizarsoftware.com.au/) -# This module is free software, and you may redistribute it and/or modify -# under the same terms as Python, so long as this copyright message and -# disclaimer are retained in their original form. -# -# IN NO EVENT SHALL BIZAR SOFTWARE PTY LTD BE LIABLE TO ANY PARTY FOR -# DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING -# OUT OF THE USE OF THIS CODE, EVEN IF THE AUTHOR HAS BEEN ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -# -# BIZAR SOFTWARE PTY LTD SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, -# BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE. THE CODE PROVIDED HEREUNDER IS ON AN "AS IS" -# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, -# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. -# -#$Id: roundup_indexer.py,v 1.2 2002-05-25 07:16:24 rochecompaan Exp $ -''' -This module provides an indexer class, RoundupIndexer, that stores text -indices in a roundup instance. This class makes searching the content of -messages and text files possible. -''' -import os -from roundup.indexer import SlicedZPickleIndexer - -class RoundupIndexer(SlicedZPickleIndexer): - ''' Indexes messages and files - ''' - - def __init__(self, db_path): - indexdb_path = os.path.join(db_path, 'indexes') - index_exists = 0 - if not os.path.exists(indexdb_path): - os.makedirs(indexdb_path) - os.chmod(indexdb_path, 0775) - else: - index_exists = 1 - index_path = os.path.join(indexdb_path, 'index.db') - SlicedZPickleIndexer.__init__(self, - INDEXDB=index_path, QUIET=9) - files_path = os.path.join(db_path, 'files') - if not index_exists: - self.add_files(dir=files_path) - self.save_index() - - def search(self, search_terms, klass): - ''' display search results - ''' - hits = self.find(search_terms) - links = [] - nodeids = {} - designator_propname = {'msg': 'messages', - 'file': 'files'} - if hits: - hitcount = len(hits) - # build a dictionary of nodes and their associated messages - # and files - for hit in hits.keys(): - filename = hits[hit].split('/')[-1] - for designator, propname in designator_propname.items(): - if filename.find(designator) == -1: continue - nodeid = filename[len(designator):] - result = apply(klass.find, (), {propname:nodeid}) - if not result: continue - - id = str(result[0]) - if not nodeids.has_key(id): - nodeids[id] = {} - - node_dict = nodeids[id] - if not node_dict.has_key(propname): - node_dict[propname] = [nodeid] - elif node_dict.has_key(propname): - node_dict[propname].append(nodeid) - - return nodeids - - -# -#$Log: not supported by cvs2svn $ -#Revision 1.1.2.3 2002/05/02 11:52:12 rochecompaan -#Fixed small bug that prevented indexes from being generated. -# -#Revision 1.1.2.2 2002/04/19 19:54:42 rochecompaan -#cgi_client.py -# removed search link for the time being -# moved rendering of matches to htmltemplate -#hyperdb.py -# filtering of nodes on full text search incorporated in filter method -#roundupdb.py -# added paramater to call of filter method -#roundup_indexer.py -# added search method to RoundupIndexer class -# -#Revision 1.1.2.1 2002/04/03 11:55:57 rochecompaan -# . Added feature #526730 - search for messages capability -#
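With roundup_indexer.py gone, the search() method now lives on roundup.indexer.Indexer, which the anydbm backend exposes as db.indexer. An end-to-end sketch under Python 2; the temporary directory, the 'msg23' identifier and the IssueStub class are illustrative stand-ins for a real tracker database and hyperdb class:

    # Sketch: index a message body, then map a full-text hit back to an
    # issue id via klass.find(messages=...), the call search() makes above.
    import os, tempfile
    from roundup.indexer import Indexer

    db_dir = tempfile.mkdtemp()
    os.makedirs(os.path.join(db_dir, 'files'))   # empty tracker file store
    indexer = Indexer(db_dir)                    # creates <db_dir>/indexes plus a 'version' marker

    # message content is fed to the index as text/plain via add_othertext()
    indexer.add_othertext('msg23', 'the spam and eggs message body')
    indexer.save_index()

    class IssueStub:
        # stands in for a hyperdb class such as db.issue
        def find(self, **props):
            if props.get('messages') == '23':
                return ['5']
            return []

    print(indexer.find(['spam', 'eggs']))             # {1: 'msg23'}
    print(indexer.search(['spam', 'eggs'], IssueStub()))
    # {'5': {'messages': ['23']}}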