From 0aa09748fed02f351c030b3e1b2e00a9eda2f472 Mon Sep 17 00:00:00 2001 From: rochecompaan Date: Sat, 25 May 2002 07:16:25 +0000 Subject: [PATCH] Merged search_indexing-branch with HEAD git-svn-id: http://svn.roundup-tracker.org/svnroot/roundup/trunk@762 57a73879-2fb5-44c3-a270-3262357dd7e2 --- CHANGES.txt | 1 + roundup/backends/back_anydbm.py | 18 +- roundup/cgi_client.py | 97 ++- roundup/htmltemplate.py | 169 ++-- roundup/hyperdb.py | 37 +- roundup/indexer.py | 790 ++++++++++++++++++ roundup/roundup_indexer.py | 98 +++ roundup/roundupdb.py | 24 +- roundup/templates/classic/instance_config.py | 36 +- roundup/templates/extended/instance_config.py | 35 +- run_tests | 11 +- 11 files changed, 1247 insertions(+), 69 deletions(-) create mode 100644 roundup/indexer.py create mode 100644 roundup/roundup_indexer.py diff --git a/CHANGES.txt b/CHANGES.txt index 51d1297..ec62d75 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -32,6 +32,7 @@ Feature: . applied patch #558876 ] cgi client customization . split instance initialisation into two steps, allowing config changes before the database is initialised. + . #526730 ] search for messages capability Fixed: . stop sending blank (whitespace-only) notes diff --git a/roundup/backends/back_anydbm.py b/roundup/backends/back_anydbm.py index 0ee8fbb..5ea64da 100644 --- a/roundup/backends/back_anydbm.py +++ b/roundup/backends/back_anydbm.py @@ -15,7 +15,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -#$Id: back_anydbm.py,v 1.34 2002-05-15 06:21:21 richard Exp $ +#$Id: back_anydbm.py,v 1.35 2002-05-25 07:16:24 rochecompaan Exp $ ''' This module defines a backend that saves the hyperdatabase in a database chosen by anydbm. 
It is guaranteed to always be available in python @@ -26,6 +26,7 @@ serious bugs, and is not available) import whichdb, anydbm, os, marshal from roundup import hyperdb, date from blobfiles import FileStorage +from roundup.roundup_indexer import RoundupIndexer from locking import acquire_lock, release_lock # @@ -61,6 +62,7 @@ class Database(FileStorage, hyperdb.Database): self.dirtynodes = {} # keep track of the dirty nodes by class self.newnodes = {} # keep track of the new nodes by class self.transactions = [] + self.indexer = RoundupIndexer(self.dir) # ensure files are group readable and writable os.umask(0002) @@ -467,6 +469,9 @@ class Database(FileStorage, hyperdb.Database): def _doStoreFile(self, name, **databases): # the file is currently ".tmp" - move it to its real name to commit os.rename(name+".tmp", name) + pattern = name.split('/')[-1] + self.indexer.add_files(dir=os.path.dirname(name), pattern=pattern) + self.indexer.save_index() def rollback(self): ''' Reverse all actions from the current transaction. @@ -485,6 +490,14 @@ class Database(FileStorage, hyperdb.Database): # #$Log: not supported by cvs2svn $ +#Revision 1.34 2002/05/15 06:21:21 richard +# . node caching now works, and gives a small boost in performance +# +#As a part of this, I cleaned up the DEBUG output and implemented TRACE +#output (HYPERDBTRACE='file to trace to') with checkpoints at the start of +#CGI requests. Run roundup with python -O to skip all the DEBUG/TRACE stuff +#(using if __debug__ which is compiled out with -O) +# #Revision 1.33 2002/04/24 10:38:26 rochecompaan #All database files are now created group readable and writable. # @@ -502,6 +515,9 @@ class Database(FileStorage, hyperdb.Database): # #Unit tests for all of the above written. # +#Revision 1.30.2.1 2002/04/03 11:55:57 rochecompaan +# . 
Added feature #526730 - search for messages capability +# #Revision 1.30 2002/02/27 03:40:59 richard #Ran it through pychecker, made fixes # diff --git a/roundup/cgi_client.py b/roundup/cgi_client.py index fcefab9..8fb4139 100644 --- a/roundup/cgi_client.py +++ b/roundup/cgi_client.py @@ -15,7 +15,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -# $Id: cgi_client.py,v 1.124 2002-05-24 02:09:24 richard Exp $ +# $Id: cgi_client.py,v 1.125 2002-05-25 07:16:24 rochecompaan Exp $ __doc__ = """ WWW request handler (also used in the stand-alone server). @@ -26,6 +26,7 @@ import binascii, Cookie, time, random import roundupdb, htmltemplate, date, hyperdb, password from roundup.i18n import _ +from roundup_indexer import RoundupIndexer class Unauthorised(ValueError): pass @@ -71,6 +72,7 @@ class Client: except ValueError: # someone gave us a non-int debug level, turn it off self.debug = 0 + self.indexer = RoundupIndexer('%s/db'%instance.INSTANCE_HOME) def getuid(self): return self.db.user.lookup(self.user) @@ -212,6 +214,17 @@ function help_window(helpurl, width, height) { links.append(_('User List')) links.append(_('Add User')) + # add the search links + if hasattr(self.instance, 'HEADER_SEARCH_LINKS'): + classes = self.instance.HEADER_SEARCH_LINKS + else: + classes = ['issue'] + l = [] + for class_name in classes: + cap_class = class_name.capitalize() + links.append(_('Search ' + '%(cap_class)s')%locals()) + # now we have all the links, join 'em links = '\n | '.join(links) @@ -331,9 +344,7 @@ function help_window(helpurl, width, height) { default_index_columns = ['id','activity','title','status','assignedto'] default_index_filterspec = {'status': ['1', '2', '3', '4', '5', '6', '7']} - def index(self): - ''' put up an index - no class specified - ''' + def _get_customisation_info(self): # see if the web has supplied us with any customisation info defaults = 1 for key in ':sort', ':group', 
':filter', ':columns': @@ -363,13 +374,39 @@ function help_window(helpurl, width, height) { # make list() extract the info from the CGI environ self.classname = 'issue' sort = group = filter = columns = filterspec = None + return columns, filter, group, sort, filterspec + + def index(self): + ''' put up an index - no class specified + ''' + columns, filter, group, sort, filterspec = \ + self._get_customisation_info() return self.list(columns=columns, filter=filter, group=group, sort=sort, filterspec=filterspec) + def searchnode(self): + columns, filter, group, sort, filterspec = \ + self._get_customisation_info() + show_nodes = 1 + if len(self.form.keys()) == 0: + # get the default search filters from instance_config + if hasattr(self.instance, 'SEARCH_FILTERS'): + for f in self.instance.SEARCH_FILTERS: + spec = getattr(self.instance, f) + if spec['CLASS'] == self.classname: + filter = spec['FILTER'] + + show_nodes = 0 + show_customization = 1 + return self.list(columns=columns, filter=filter, group=group, + sort=sort, filterspec=filterspec, + show_customization=show_customization, show_nodes=show_nodes) + + # XXX deviates from spec - loses the '+' (that's a reserved character # in URLS def list(self, sort=None, group=None, filter=None, columns=None, - filterspec=None, show_customization=None): + filterspec=None, show_customization=None, show_nodes=1): ''' call the template index with the args :sort - sort by prop name, optionally preceeded with '-' @@ -393,11 +430,16 @@ function help_window(helpurl, width, height) { if filterspec is None: filterspec = self.index_filterspec(filter) if show_customization is None: show_customization = self.customization_widget() + if self.form.has_key('search_text'): + search_text = self.form['search_text'].value + else: + search_text = '' index = htmltemplate.IndexTemplate(self, self.instance.TEMPLATES, cn) try: - index.render(filterspec, filter, columns, sort, group, - show_customization=show_customization) + 
index.render(filterspec, search_text, filter, columns, sort, + group, show_customization=show_customization, + show_nodes=show_nodes) except htmltemplate.MissingTemplateError: self.basicClassEditPage() self.pagefoot() @@ -560,6 +602,7 @@ function help_window(helpurl, width, height) { self.pagefoot() showissue = shownode showmsg = shownode + searchissue = searchnode def _add_author_to_nosy(self, props): ''' add the author value from the props to the nosy list @@ -1243,7 +1286,7 @@ function help_window(helpurl, width, height) { self.db.commit() def do_action(self, action, dre=re.compile(r'([^\d]+)(\d+)'), - nre=re.compile(r'new(\w+)')): + nre=re.compile(r'new(\w+)'), sre=re.compile(r'search(\w+)')): '''Figure the user's action and do it. ''' # here be the "normal" functionality @@ -1294,6 +1337,17 @@ function help_window(helpurl, width, height) { func() return + # see if we're to put up the new node page + m = sre.match(action) + if m: + self.classname = m.group(1) + try: + func = getattr(self, 'search%s'%self.classname) + except AttributeError: + raise NotFound + func() + return + # otherwise, display the named class self.classname = action try: @@ -1311,6 +1365,7 @@ class ExtendedClient(Client): showtimelog = Client.shownode newsupport = Client.newnode newtimelog = Client.newnode + searchsupport = Client.searchnode default_index_sort = ['-activity'] default_index_group = ['priority'] @@ -1399,6 +1454,9 @@ def parsePropsFromForm(db, cl, form, nodeid=0): # # $Log: not supported by cvs2svn $ +# Revision 1.124 2002/05/24 02:09:24 richard +# Nothing like a live demo to show up the bugs ;) +# # Revision 1.123 2002/05/22 05:04:13 richard # Oops # @@ -1434,6 +1492,29 @@ def parsePropsFromForm(db, cl, form, nodeid=0): # Revision 1.115 2002/04/02 01:56:10 richard # . stop sending blank (whitespace-only) notes # +# Revision 1.114.2.4 2002/05/02 11:49:18 rochecompaan +# Allow customization of the search filters that should be displayed +# on the search page. 
+# +# Revision 1.114.2.3 2002/04/20 13:23:31 rochecompaan +# We now have a separate search page for nodes. Search links for +# different classes can be customized in instance_config similar to +# index links. +# +# Revision 1.114.2.2 2002/04/19 19:54:42 rochecompaan +# cgi_client.py +# removed search link for the time being +# moved rendering of matches to htmltemplate +# hyperdb.py +# filtering of nodes on full text search incorporated in filter method +# roundupdb.py +# added parameter to call of filter method +# roundup_indexer.py +# added search method to RoundupIndexer class +# +# Revision 1.114.2.1 2002/04/03 11:55:57 rochecompaan +# . Added feature #526730 - search for messages capability +# # Revision 1.114 2002/03/17 23:06:05 richard # oops # diff --git a/roundup/htmltemplate.py b/roundup/htmltemplate.py index 2f6557a..b5b9987 100644 --- a/roundup/htmltemplate.py +++ b/roundup/htmltemplate.py @@ -15,7 +15,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -# $Id: htmltemplate.py,v 1.89 2002-05-15 06:34:47 richard Exp $ +# $Id: htmltemplate.py,v 1.90 2002-05-25 07:16:24 rochecompaan Exp $ __doc__ = """ Template engine. 
@@ -714,7 +714,8 @@ class IndexTemplateReplace: def go(self, text): return self.replace.sub(self, text) - def __call__(self, m, filter=None, columns=None, sort=None, group=None): + def __call__(self, m, search_text=None, filter=None, columns=None, + sort=None, group=None): if m.group('name'): if m.group('name') in self.props: text = m.group('text') @@ -743,8 +744,9 @@ class IndexTemplate(TemplateFunctions): self.properties = self.cl.getprops() col_re=re.compile(r']+)">') - def render(self, filterspec={}, filter=[], columns=[], sort=[], group=[], - show_display_form=1, nodeids=None, show_customization=1): + def render(self, filterspec={}, search_text='', filter=[], columns=[], + sort=[], group=[], show_display_form=1, nodeids=None, + show_customization=1, show_nodes=1): self.filterspec = filterspec w = self.client.write @@ -784,8 +786,8 @@ class IndexTemplate(TemplateFunctions): if (show_display_form and self.instance.FILTER_POSITION in ('top and bottom', 'top')): w('
\n'%self.classname) - self.filter_section(filter_template, filter, columns, group, - all_filters, all_columns, show_customization) + self.filter_section(filter_template, search_text, filter, + columns, group, all_filters, all_columns, show_customization) # make sure that the sorting doesn't get lost either if sort: w(''% @@ -817,49 +819,58 @@ class IndexTemplate(TemplateFunctions): # now actually loop through all the nodes we get from the filter and # apply the template - if nodeids is None: - nodeids = self.cl.filter(filterspec, sort, group) - for nodeid in nodeids: - # check for a group heading - if group_names: - this_group = [self.cl.get(nodeid, name, _('[no value]')) - for name in group_names] - if this_group != old_group: - l = [] - for name in group_names: - prop = self.properties[name] - if isinstance(prop, hyperdb.Link): - group_cl = self.db.classes[prop.classname] - key = group_cl.getkey() - value = self.cl.get(nodeid, name) - if value is None: - l.append(_('[unselected %(classname)s]')%{ - 'classname': prop.classname}) - else: - l.append(group_cl.get(self.cl.get(nodeid, - name), key)) - elif isinstance(prop, hyperdb.Multilink): - group_cl = self.db.classes[prop.classname] - key = group_cl.getkey() - for value in self.cl.get(nodeid, name): - l.append(group_cl.get(value, key)) - else: - value = self.cl.get(nodeid, name, _('[no value]')) - if value is None: - value = _('[empty %(name)s]')%locals() + if show_nodes: + matches = None + if nodeids is None: + if search_text != '': + matches = self.client.indexer.search( + search_text.split(' '), self.cl) + nodeids = self.cl.filter(matches, filterspec, sort, group) + for nodeid in nodeids: + # check for a group heading + if group_names: + this_group = [self.cl.get(nodeid, name, _('[no value]')) + for name in group_names] + if this_group != old_group: + l = [] + for name in group_names: + prop = self.properties[name] + if isinstance(prop, hyperdb.Link): + group_cl = self.db.classes[prop.classname] + key = 
group_cl.getkey() + value = self.cl.get(nodeid, name) + if value is None: + l.append(_('[unselected %(classname)s]')%{ + 'classname': prop.classname}) + else: + l.append(group_cl.get(self.cl.get(nodeid, + name), key)) + elif isinstance(prop, hyperdb.Multilink): + group_cl = self.db.classes[prop.classname] + key = group_cl.getkey() + for value in self.cl.get(nodeid, name): + l.append(group_cl.get(value, key)) else: - value = str(value) - l.append(value) - w('' - '%s'%( - len(columns), ', '.join(l))) - old_group = this_group - - # display this node's row - replace = IndexTemplateReplace(self.globals, locals(), columns) - self.nodeid = nodeid - w(replace.go(template)) - self.nodeid = None + value = self.cl.get(nodeid, name, + _('[no value]')) + if value is None: + value = _('[empty %(name)s]')%locals() + else: + value = str(value) + l.append(value) + w('' + '' + '%s'%( + len(columns), ', '.join(l))) + old_group = this_group + + # display this node's row + replace = IndexTemplateReplace(self.globals, locals(), columns) + self.nodeid = nodeid + w(replace.go(template)) + if matches: + self.node_matches(matches[nodeid], len(columns)) + self.nodeid = None w('') @@ -867,17 +878,46 @@ class IndexTemplate(TemplateFunctions): if (show_display_form and hasattr(self.instance, 'FILTER_POSITION') and self.instance.FILTER_POSITION in ('top and bottom', 'bottom')): w('\n'%self.classname) - self.filter_section(filter_template, filter, columns, group, - all_filters, all_columns, show_customization) + self.filter_section(filter_template, search_text, filter, + columns, group, all_filters, all_columns, show_customization) # make sure that the sorting doesn't get lost either if sort: w(''% ','.join(sort)) w('
\n') + def node_matches(self, match, colspan): + ''' display the files and messages for a node that matched a + full text search + ''' + w = self.client.write - def filter_section(self, template, filter, columns, group, all_filters, - all_columns, show_customization): + message_links = [] + file_links = [] + if match.has_key('messages'): + for msgid in match['messages']: + k = self.db.msg.labelprop() + lab = self.db.msg.get(msgid, k) + msgpath = 'msg%s'%msgid + message_links.append('%(lab)s' + %locals()) + w(_('' + '  Matched messages: %s')%( + colspan, ', '.join(message_links))) + + if match.has_key('files'): + for fileid in match['files']: + filename = self.db.file.get(fileid, 'name') + filepath = 'file%s/%s'%(fileid, filename) + file_links.append('%(filename)s' + %locals()) + w(_('' + '  Matched files: %s')%( + colspan, ', '.join(file_links))) + + + def filter_section(self, template, search_text, filter, columns, group, + all_filters, all_columns, show_customization): w = self.client.write @@ -891,6 +931,11 @@ class IndexTemplate(TemplateFunctions): w('') w(_(' Filter specification...')) w('') + w('') + w('Search terms') + w(''%( + search_text)) + w('') replace = IndexTemplateReplace(self.globals, locals(), filter) w(replace.go(template)) w(' ') @@ -1022,6 +1067,7 @@ class IndexTemplate(TemplateFunctions): w(':sort=%s'%','.join(m[:2])) return '&'.join(l) + # # ITEM TEMPLATES # @@ -1124,6 +1170,9 @@ class NewItemTemplate(TemplateFunctions): # # $Log: not supported by cvs2svn $ +# Revision 1.89 2002/05/15 06:34:47 richard +# forgot to fix the templating for last change +# # Revision 1.88 2002/04/24 08:34:35 rochecompaan # Sorting was applied to all nodes of the MultiLink class instead of # the nodes that are actually linked to in the "field" template @@ -1147,6 +1196,22 @@ class NewItemTemplate(TemplateFunctions): # text. The link value is displayed as a tooltip using the title anchor # attribute. 
# +# Revision 1.84.2.2 2002/04/20 13:23:32 rochecompaan +# We now have a separate search page for nodes. Search links for +# different classes can be customized in instance_config similar to +# index links. +# +# Revision 1.84.2.1 2002/04/19 19:54:42 rochecompaan +# cgi_client.py +# removed search link for the time being +# moved rendering of matches to htmltemplate +# hyperdb.py +# filtering of nodes on full text search incorporated in filter method +# roundupdb.py +# added parameter to call of filter method +# roundup_indexer.py +# added search method to RoundupIndexer class +# # Revision 1.84 2002/03/29 19:41:48 rochecompaan # . Fixed display of mutlilink properties when using the template # functions, menu and plain. diff --git a/roundup/hyperdb.py b/roundup/hyperdb.py index bf532d2..776704c 100644 --- a/roundup/hyperdb.py +++ b/roundup/hyperdb.py @@ -15,7 +15,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -# $Id: hyperdb.py,v 1.65 2002-05-22 04:12:05 richard Exp $ +# $Id: hyperdb.py,v 1.66 2002-05-25 07:16:24 rochecompaan Exp $ __doc__ = """ Hyperdatabase implementation, especially field types. 
@@ -845,7 +845,8 @@ class Class: return l # XXX not in spec - def filter(self, filterspec, sort, group, num_re = re.compile('^\d+$')): + def filter(self, search_matches, filterspec, sort, group, + num_re = re.compile('^\d+$')): ''' Return a list of the ids of the active nodes in this class that match the 'filter' spec, sorted by the group spec and then the sort spec @@ -935,6 +936,16 @@ class Class: l.append((nodeid, node)) l.sort() + # filter based on full text search + if search_matches is not None: + k = [] + l_debug = [] + for v in l: + l_debug.append(v[0]) + if search_matches.has_key(v[0]): + k.append(v) + l = k + # optimise sort m = [] for entry in sort: @@ -1146,6 +1157,12 @@ def Choice(name, db, *options): # # $Log: not supported by cvs2svn $ +# Revision 1.65 2002/05/22 04:12:05 richard +# . applied patch #558876 ] cgi client customization +# ... with significant additions and modifications ;) +# - extended handling of ML assignedto to all places it's handled +# - added more NotFound info +# # Revision 1.64 2002/05/15 06:21:21 richard # . node caching now works, and gives a small boost in performance # @@ -1175,6 +1192,22 @@ def Choice(name, db, *options): # # Unit tests for all of the above written. # +# Revision 1.59.2.2 2002/04/20 13:23:33 rochecompaan +# We now have a separate search page for nodes. Search links for +# different classes can be customized in instance_config similar to +# index links. 
+# +# Revision 1.59.2.1 2002/04/19 19:54:42 rochecompaan +# cgi_client.py +# removed search link for the time being +# moved rendering of matches to htmltemplate +# hyperdb.py +# filtering of nodes on full text search incorporated in filter method +# roundupdb.py +# added parameter to call of filter method +# roundup_indexer.py +# added search method to RoundupIndexer class +# # Revision 1.59 2002/03/12 22:52:26 richard # more pychecker warnings removed # diff --git a/roundup/indexer.py b/roundup/indexer.py new file mode 100644 index 0000000..47f0120 --- /dev/null +++ b/roundup/indexer.py @@ -0,0 +1,790 @@ +#!/usr/bin/env python + +"""Create full-text indexes and search them + +Notes: + + See http://gnosis.cx/publish/programming/charming_python_15.txt + for a detailed discussion of this module. + + This version requires Python 1.6+. It turns out that the use + of string methods rather than [string] module functions is + enough faster in a tight loop so as to provide a quite + remarkable 25% speedup in overall indexing. However, only FOUR + lines in TextSplitter.text_splitter() were changed away from + Python 1.5 compatibility. Those lines are followed by comments + beginning with "# 1.52: " that show the old forms. Python + 1.5 users can restore these lines, and comment out those just + above them. + +Classes: + + GenericIndexer -- Abstract class + TextSplitter -- Mixin class + Index + ShelveIndexer + FlatIndexer + XMLPickleIndexer + PickleIndexer + ZPickleIndexer + SlicedZPickleIndexer + +Functions: + + echo_fname(fname) + recurse_files(...) + +Index Formats: + + *Indexer.files: filename --> (fileid, wordcount) + *Indexer.fileids: fileid --> filename + *Indexer.words: word --> {fileid1:occurs, fileid2:occurs, ...} + +Module Usage: + + There are a few ways to use this module. 
Just to utilize existing + functionality, something like the following is a likely + pattern: + + import gnosis.indexer as indexer + index = indexer.MyFavoriteIndexer() # For some concrete Indexer + index.load_index('myIndex.db') + index.add_files(dir='/this/that/otherdir', pattern='*.txt') + hits = index.find(['spam','eggs','bacon']) + index.print_report(hits) + + To customize the basic classes, something like the following is likely: + + class MySplitter: + def splitter(self, text, ftype): + "Perform much better splitting than default (for filetypes)" + # ... + return words + + class MyIndexer(indexer.GenericIndexer, MySplitter): + def load_index(self, INDEXDB=None): + "Retrieve three dictionaries from clever storage method" + # ... + self.words, self.files, self.fileids = WORDS, FILES, FILEIDS + def save_index(self, INDEXDB=None): + "Save three dictionaries to clever storage method" + + index = MyIndexer() + # ...etc... + +Benchmarks: + + As we know, there are lies, damn lies, and benchmarks. Take + the below with an adequate dose of salt. In version 0.10 of + the concrete indexers, some performance was tested. The + test case was a set of mail/news archives, that were about + 43 mB, and 225 files. In each case, an index was generated + (if possible), and a search for the words "xml python" was + performed. + + - Index w/ PickleIndexer: 482s, 2.4 mB + - Search w/ PickleIndexer: 1.74s + - Index w/ ZPickleIndexer: 484s, 1.2 mB + - Search w/ ZPickleIndexer: 1.77s + - Index w/ FlatIndexer: 492s, 2.6 mB + - Search w/ FlatIndexer: 53s + - Index w/ ShelveIndexer: (dumbdbm) Many minutes, tens of mBs + - Search w/ ShelveIndexer: Aborted before completely indexed + - Index w/ ShelveIndexer: (dbhash) Long time (partial crash), 10 mB + - Search w/ ShelveIndexer: N/A. 
Too many glitches + - Index w/ XMLPickleIndexer: Memory error (xml_pickle uses bad string + composition for large output) + - Search w/ XMLPickleIndexer: N/A + - grep search (xml|python): 20s (cached: <5s) + - 'srch' utility (python): 12s +""" +#$Id: indexer.py,v 1.2 2002-05-25 07:16:24 rochecompaan Exp $ + +__shell_usage__ = """ +Shell Usage: [python] indexer.py [options] [search_words] + + -h, /h, -?, /?, ?, --help: Show this help screen + -index: Add files to index + -reindex: Refresh files already in the index + (can take much more time) + -casesensitive: Maintain the case of indexed words + (can lead to MUCH larger indices) + -norecurse, -local: Only index starting dir, not subdirs + -dir=: Starting directory for indexing + (default is current directory) + -indexdb=: Use specified index database + (environ variable INDEXER_DB is preferred) + -regex=: Index files matching regular expression + -glob=: Index files matching glob pattern + -filter= Only display results matching pattern + -output=, -format=: How much detail on matches? + -: Quiet level (0=verbose ... 9=quiet) + +Output/format options are ALL/EVERYTHING/VERBOSE, RATINGS/SCORES, +FILENAMES/NAMES/FILES, SUMMARY/REPORT""" + +__version__ = "$Revision: 1.2 $" +__author__=["David Mertz (mertz@gnosis.cx)",] +__thanks_to__=["Pat Knight (p.knight@ktgroup.co.uk)", + "Gregory Popovitch (greg@gpy.com)", ] +__copyright__=""" + This file is released to the public domain. I (dqm) would + appreciate it if you choose to keep derived works under terms + that promote freedom, but obviously am giving up any rights + to compel such. +""" + +__history__=""" + 0.1 Initial version. + + 0.11 Tweaked TextSplitter after some random experimentation. + + 0.12 Added SlicedZPickleIndexer (best choice, so far). + + 0.13 Pat Knight pointed out need for binary open()'s of + certain files under Windows. + + 0.14 Added '-filter' switch to search results. 
+ + 0.15 Added direct read of gzip files + + 0.20 Gregory Popovitch did some profiling on TextSplitter, + and provided both huge speedups to the Python version + and hooks to a C extension class (ZopeTextSplitter). + A little refactoring by he and I (dqm) has nearly + doubled the speed of indexing + + 0.30 Module refactored into gnosis package. This is a + first pass, and various documentation and test cases + should be added later. +""" +import string, re, os, fnmatch, sys, copy, gzip +from types import * + +#-- Silly "do nothing" default recursive file processor +def echo_fname(fname): print fname + +#-- "Recurse and process files" utility function +def recurse_files(curdir, pattern, exclusions, func=echo_fname, *args, **kw): + "Recursively process file pattern" + subdirs, files = [],[] + level = kw.get('level',0) + + for name in os.listdir(curdir): + fname = os.path.join(curdir, name) + if name[-4:] in exclusions: + pass # do not include binary file type + elif os.path.isdir(fname) and not os.path.islink(fname): + subdirs.append(fname) + # kludge to detect a regular expression across python versions + elif sys.version[0]=='1' and isinstance(pattern, re.RegexObject): + if pattern.match(name): + files.append(fname) + elif sys.version[0]=='2' and type(pattern)==type(re.compile('')): + if pattern.match(name): + files.append(fname) + elif type(pattern) is StringType: + if fnmatch.fnmatch(name, pattern): + files.append(fname) + + for fname in files: + apply(func, (fname,)+args) + for subdir in subdirs: + recurse_files(subdir, pattern, exclusions, func, level=level+1) + +#-- Data bundle for index dictionaries +class Index: + def __init__(self, words, files, fileids): + if words is not None: self.WORDS = words + if files is not None: self.FILES = files + if fileids is not None: self.FILEIDS = fileids + +#-- "Split plain text into words" utility function +class TextSplitter: + def initSplitter(self): + prenum = string.join(map(chr, range(0,48)), '') + num2cap = 
string.join(map(chr, range(58,65)), '') + cap2low = string.join(map(chr, range(91,97)), '') + postlow = string.join(map(chr, range(123,256)), '') + nonword = prenum + num2cap + cap2low + postlow + self.word_only = string.maketrans(nonword, " "*len(nonword)) + self.nondigits = string.join(map(chr, range(0,48)) + map(chr, range(58,255)), '') + self.alpha = string.join(map(chr, range(65,91)) + map(chr, range(97,123)), '') + self.ident = string.join(map(chr, range(256)), '') + self.init = 1 + + def splitter(self, text, ftype): + "Split the contents of a text string into a list of 'words'" + if ftype == 'text/plain': + words = self.text_splitter(text, self.casesensitive) + else: + raise NotImplementedError + return words + + def text_splitter(self, text, casesensitive=0): + """Split text/plain string into a list of words + + In version 0.20 this function is still fairly weak at + identifying "real" words, and excluding gibberish + strings. As long as the indexer looks at "real" text + files, it does pretty well; but if indexing of binary + data is attempted, a lot of gibberish gets indexed. + Suggestions on improving this are GREATLY APPRECIATED. 
+ """ + # Initialize some constants + if not hasattr(self,'init'): self.initSplitter() + + # Speedup trick: attributes into local scope + word_only = self.word_only + ident = self.ident + alpha = self.alpha + nondigits = self.nondigits + translate = string.translate + + # Let's adjust case if not case-sensitive + if not casesensitive: text = string.upper(text) + + # Split the raw text + allwords = string.split(text) + + # Finally, let's skip some words not worth indexing + words = [] + for word in allwords: + if len(word) > 25: continue # too long (probably gibberish) + + # Identify common patterns in non-word data (binary, UU/MIME, etc) + num_nonalpha = len(word.translate(ident, alpha)) + numdigits = len(word.translate(ident, nondigits)) + # 1.52: num_nonalpha = len(translate(word, ident, alpha)) + # 1.52: numdigits = len(translate(word, ident, nondigits)) + if numdigits > len(word)-2: # almost all digits + if numdigits > 5: # too many digits is gibberish + continue # a moderate number is year/zipcode/etc + elif num_nonalpha*3 > len(word): # too much scattered nonalpha = gibberish + continue + + word = word.translate(word_only) # Let's strip funny byte values + # 1.52: word = translate(word, word_only) + subwords = word.split() # maybe embedded non-alphanumeric + # 1.52: subwords = string.split(word) + for subword in subwords: # ...so we might have subwords + if len(subword) <= 2: continue # too short a subword + words.append(subword) + return words + +class ZopeTextSplitter: + def initSplitter(self): + import Splitter + stop_words=( + 'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across', + 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', + 'along', 'already', 'also', 'although', 'always', 'am', 'among', + 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', + 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', + 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', + 'becoming', 
'been', 'before', 'beforehand', 'behind', 'being', + 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', + 'bottom', 'but', 'by', 'can', 'cannot', 'cant', 'con', 'could', + 'couldnt', 'cry', 'describe', 'detail', 'do', 'done', 'down', 'due', + 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', + 'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone', + 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', + 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', + 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', + 'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her', + 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', + 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', + 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it', + 'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least', + 'less', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill', + 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', + 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless', + 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', + 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once', + 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', + 'ours', 'ourselves', 'out', 'over', 'own', 'per', 'perhaps', + 'please', 'pre', 'put', 'rather', 're', 'same', 'see', 'seem', + 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should', + 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', + 'somehow', 'someone', 'something', 'sometime', 'sometimes', + 'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the', + 'their', 'them', 'themselves', 'then', 'thence', 'there', + 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', + 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three', + 'through', 'throughout', 'thru', 
class GenericIndexer:
    """Storage-independent half of a full-text word indexer.

    The class maintains three mappings, which concrete subclasses must
    load and save:

      self.files    filename -> (fileid, wordcount); the special key
                    '_TOP' holds (-last_assigned_fileid, None)
      self.fileids  fileid -> filename
      self.words    word -> {fileid: number of occurrences}

    Subclasses provide load_index() and save_index(); splitter(text, ftype)
    is expected to come from a splitter mix-in class.
    """
    def __init__(self, **kw):
        self.configure(**kw)

    def whoami(self):
        "Return the name of the concrete indexer class"
        return self.__class__.__name__

    def configure(self, REINDEX=0, CASESENSITIVE=0,
                  INDEXDB=os.environ.get('INDEXER_DB', 'TEMP_NDX.DB'),
                  ADD_PATTERN='*', QUIET=5):
        """Configure settings used by indexing and storage/retrieval

        REINDEX        -- if true, files already in the index are indexed
                          again instead of being skipped
        CASESENSITIVE  -- if false, search words are upper-cased in find()
        INDEXDB        -- location of the stored index (defaults to the
                          INDEXER_DB environment variable as read at import
                          time, else 'TEMP_NDX.DB')
        ADD_PATTERN    -- default filename pattern used by add_files()
        QUIET          -- verbosity; lower is chattier (below 5 add_file()
                          reports progress, see print_report() for the
                          search output levels)
        """
        self.indexdb = INDEXDB
        self.reindex = REINDEX
        self.casesensitive = CASESENSITIVE
        self.add_pattern = ADD_PATTERN
        self.quiet = QUIET
        self.filter = None

    def add_files(self, dir=None, pattern=None, descend=1):
        """Load the index, then pass dir and pattern to recurse_files()
        with add_file() as the per-file callback.

        dir defaults to the working directory at the time of the call,
        pattern to self.add_pattern.  descend is accepted for interface
        compatibility but is not consulted by this method.
        """
        if dir is None:
            # Resolved per call; a default of os.getcwd() in the signature
            # would be frozen at import time.
            dir = os.getcwd()
        self.load_index()
        exclusions = ('.zip','.pyc','.gif','.jpg','.dat','.dir')
        if not pattern:
            pattern = self.add_pattern
        recurse_files(dir, pattern, exclusions, self.add_file)
        # Rebuild the fileid index
        self.fileids = {}
        for fname in self.files.keys():
            self.fileids[self.files[fname][0]] = fname

    def add_file(self, fname, ftype='text/plain'):
        """Index the contents of a regular file

        Returns 0 when the file is skipped (already indexed and reindexing
        is off) or cannot be read; returns None once it has been indexed.
        """
        # .get() rather than `in`: works for plain dicts and shelves alike
        if self.files.get(fname) is not None:
            if not self.reindex:            # DO NOT reindex this file
                if self.quiet < 5:
                    sys.stdout.write("Skipping " + str(fname) + "\n")
                return 0
            # Reindexing enabled, cleanup dicts
            self.purge_entry(fname, self.files, self.words)

        # Read in the file (if possible)
        try:
            if fname[-3:] == '.gz':
                source = gzip.open(fname)
            else:
                source = open(fname)
            try:
                text = source.read()
            finally:
                source.close()
        except IOError:
            return 0
        if self.quiet < 5:
            sys.stdout.write("Indexing " + str(fname) + "\n")
        words = self.splitter(text, ftype)

        # Find new file index, and assign it to filename
        # (_TOP uses trick of negative to avoid conflict with file index)
        self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
        file_index = abs(self.files['_TOP'][0])
        self.files[fname] = (file_index, len(words))

        # Count the occurrences of each word in this file ...
        filedict = {}
        for word in words:
            filedict[word] = filedict.get(word, 0) + 1

        # ... and merge the counts into the global word index
        for word, count in filedict.items():
            entry = self.words.get(word)
            if entry is None:
                entry = {}
            entry[file_index] = count
            self.words[word] = entry    # write back (matters for shelves)

    def add_othertext(self, identifier):
        """Index a textual source other than a plain file

        A child class might want to implement this method (or a similar one)
        in order to index textual sources such as SQL tables, URLs, clay
        tablets, or whatever else.  The identifier should uniquely pick out
        the source of the text (whatever it is)
        """
        raise NotImplementedError

    def save_index(self, INDEXDB=None):
        "Store the index; must be provided by a concrete subclass"
        raise NotImplementedError

    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
        "Load the index; must be provided by a concrete subclass"
        raise NotImplementedError

    def find(self, wordlist, print_report=0):
        """Locate files that match ALL the words in wordlist

        Returns a {fileid: filename} dictionary of the matching files, or
        0 as soon as one of the words is not in the index at all.
        """
        self.load_index(wordlist=wordlist)
        entries = {}
        hits = copy.copy(self.fileids)      # Copy of fileids index
        for word in wordlist:
            if not self.casesensitive:
                word = word.upper()
            entry = self.words.get(word)    # For each word, get index
            entries[word] = entry           #   of matching files
            if not entry:                   # Nothing for this one word (fail)
                return 0
            # Eliminate hits for every non-match (iterate over a copy of
            # the keys because hits shrinks inside the loop)
            for fileid in list(hits.keys()):
                if fileid not in entry:
                    del hits[fileid]
        if print_report:
            self.print_report(hits, wordlist, entries)
        return hits

    def print_report(self, hits=None, wordlist=None, entries=None):
        """Print the search result and return it as a list of strings

        hits, wordlist and entries are the values computed by find().
        What is shown depends on self.quiet: at 3 or lower each filename
        is prefixed with a rating (matches per 1000 words), at 2 or lower
        the per-word counts are appended, at 5 or lower the list is
        written to stdout.  The summary line always goes to stderr.
        """
        if hits is None: hits = {}
        if wordlist is None: wordlist = []
        if entries is None: entries = {}
        # Figure out what to actually print (based on QUIET level)
        output = []
        for fileid, fname in hits.items():
            message = fname
            if self.quiet <= 3:
                wordcount = self.files[fname][1]
                matches = 0
                countmess = '\n' + ' '*13 + repr(wordcount) + ' words; '
                for word in wordlist:
                    if not self.casesensitive:
                        word = word.upper()
                    occurs = entries[word][fileid]
                    matches = matches + occurs
                    countmess = countmess + repr(occurs) + ' ' + word + '; '
                if wordcount:
                    rating = 1000*matches // wordcount
                else:
                    rating = 0      # empty file: avoid dividing by zero
                message = ('[RATING: ' + repr(rating) + ']').ljust(13) + message
                if self.quiet <= 2: message = message + countmess + '\n'
            if self.filter:     # Using an output filter
                if fnmatch.fnmatch(message, self.filter):
                    output.append(message)
            else:
                output.append(message)

        if self.quiet <= 5:
            sys.stdout.write('\n'.join(output) + '\n')
        sys.stderr.write('\n' + repr(len(output)) + ' files matched wordlist: ' +
                         repr(wordlist) + '\n')
        return output

    def purge_entry(self, fname, file_dct, word_dct):
        """Remove a file from file index and word index

        file_dct maps filename -> (fileid, wordcount) and word_dct maps
        word -> {fileid: occurrences}.  An unknown fname is a no-op.
        """
        try:        # The easy part, cleanup the file index
            # The word index is keyed by the fileid alone, not by the
            # (fileid, wordcount) tuple stored in the file index.
            file_index = file_dct[fname][0]
            del file_dct[fname]
        except KeyError:
            return  # Never indexed, so the word index has nothing of it
        # The much harder part, cleanup the word index
        for word, occurs in list(word_dct.items()):
            if file_index in occurs:
                del occurs[file_index]
                word_dct[word] = occurs

    def index_loaded(self):
        "Tell whether all three index mappings exist on this instance"
        return ( hasattr(self,'fileids') and
                 hasattr(self,'files')   and
                 hasattr(self,'words')   )
def load_index(self, INDEXDB=None, reload=0, wordlist=None):
    """Read the flat-file index into self.words, self.files, self.fileids

    The file is the one written by save_index():
      - a file line is '-', the filename, the fileid and the word count
      - a word line is the word followed by alternating fileid and
        occurrence-count tokens
    INDEXDB defaults to self.indexdb; wordlist is accepted for interface
    compatibility and ignored.  Returns 0 without touching anything when
    an index is already loaded and reload is false.  A missing index file
    leaves a new, empty index.
    """
    # Unless reload is indicated, do not load twice
    if self.index_loaded() and not reload:
        return 0
    # Ok, now let's actually load it
    INDEXDB = INDEXDB or self.indexdb
    self.words = {}
    self.files = {'_TOP': (0, None)}
    self.fileids = {}

    def number(token):
        # save_index() writes repr() of an int, or of None for the word
        # count of '_TOP'; parse exactly that instead of eval()-ing
        # whatever text the file happens to contain.
        if token == 'None':
            return None
        return int(token)

    try:
        index_file = open(INDEXDB)
    except IOError:
        return              # New index
    try:
        lines = index_file.readlines()
    finally:
        index_file.close()

    for line in lines:
        fields = line.split()
        if not fields:
            continue        # blank line
        try:
            if fields[0] == '-':        # Read a file/fileid line
                if len(fields) < 4:
                    continue
                # The two numbers are the last tokens; everything in
                # between is the filename, which may contain spaces.
                fileid = number(fields[-2])
                wordcount = number(fields[-1])
                fname = ' '.join(fields[1:-2])
                self.files[fname] = (fileid, wordcount)
                self.fileids[fileid] = fname
            else:                       # Read a word entry (dict of hits)
                entries = {}
                for n in range(1, len(fields), 2):
                    entries[number(fields[n])] = number(fields[n+1])
                self.words[fields[0]] = entries
        except (ValueError, IndexError):
            # Damaged line: skip it, but keep loading the rest rather
            # than silently dropping the remainder of the index.
            continue
INDEXDB or self.indexdb + db = Index(self.words, self.files, self.fileids) + open(INDEXDB,'wb').write(cPickle.dumps(db, 1)) + +class XMLPickleIndexer(PickleIndexer): + """Concrete Indexer utilizing XML for storage + + While this is, as expected, a verbose format, the possibility + of using XML as a transport format for indexes might be + useful. However, [xml_pickle] is in need of some redesign to + avoid gross inefficiency when creating very large + (multi-megabyte) output files (fixed in [xml_pickle] version + 0.48 or above) + """ + def load_index(self, INDEXDB=None, reload=0, wordlist=None): + # Unless reload is indicated, do not load twice + if self.index_loaded() and not reload: return 0 + # Ok, now let's actually load it + from gnosis.xml.pickle import XML_Pickler + INDEXDB = INDEXDB or self.indexdb + try: # XML file exists + xml_str = open(INDEXDB).read() + db = XML_Pickler().loads(xml_str) + except: # New index + db = Index({}, {'_TOP':(0,None)}, {}) + self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS + + def save_index(self, INDEXDB=None): + from gnosis.xml.pickle import XML_Pickler + INDEXDB = INDEXDB or self.indexdb + db = Index(self.words, self.files, self.fileids) + open(INDEXDB,'w').write(XML_Pickler(db).dumps()) + +class ZPickleIndexer(PickleIndexer): + def load_index(self, INDEXDB=None, reload=0, wordlist=None): + # Unless reload is indicated, do not load twice + if self.index_loaded() and not reload: return 0 + # Ok, now let's actually load it + import cPickle, zlib + INDEXDB = INDEXDB or self.indexdb + try: + pickle_str = zlib.decompress(open(INDEXDB+'!','rb').read()) + db = cPickle.loads(pickle_str) + except: # New index + db = Index({}, {'_TOP':(0,None)}, {}) + self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS + + def save_index(self, INDEXDB=None): + import cPickle, zlib + INDEXDB = INDEXDB or self.indexdb + db = Index(self.words, self.files, self.fileids) + pickle_fh = open(INDEXDB+'!','wb') + 
class SlicedZPickleIndexer(ZPickleIndexer):
    """Indexer storing zlib-compressed pickles, one file per word segment

    Every file is named INDEXDB + segment:
      'A'..'Z'  words starting with that letter (compared upper-cased)
      '#'       all other words
      '-'       the filename and fileid tables
      '!'       only read, never written here; it is the suffix
                ZPickleIndexer uses for its single-file index
    A search only has to read the segments of the words searched for.
    """
    segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!"

    def load_index(self, INDEXDB=None, reload=0, wordlist=None):
        """Load the segments needed for wordlist (all of them if empty)

        Segments already in memory are not read again unless reload is
        true, but segments still missing ARE read and merged in.  Without
        that, an index loaded for one search would stay partial: later
        searches would miss words and a later save would drop them.
        Returns 0 when there was nothing left to load.
        """
        INDEXDB = INDEXDB or self.indexdb
        # Identify the relevant word-dictionary segments
        if not wordlist:
            wanted = list(self.segments)
        else:
            wanted = ['-', '#']
            for word in wordlist:
                initial = word[:1].upper()      # '' for an empty word
                if initial and initial not in wanted:
                    wanted.append(initial)

        loaded = getattr(self, '_loaded_segments', None)
        if self.index_loaded() and not reload:
            if loaded is None:
                # Index was populated by other means; treat as complete
                return 0
            pending = [seg for seg in wanted if seg not in loaded]
            if not pending:
                return 0
            fresh = 0
        else:
            self.words, self.files, self.fileids = {}, {'_TOP':(0,None)}, {}
            loaded = self._loaded_segments = {}
            self._segment_source = INDEXDB
            pending = wanted
            fresh = 1

        # Load the segments
        for segment in pending:
            loaded[segment] = 1
            dbslice = self._read_segment(INDEXDB+segment)
            if dbslice is None:
                continue    # No biggie, couldn't find this segment
            contents = dbslice.__dict__
            if contents.get('WORDS'):       # If it has some words, add them
                for word, entry in contents['WORDS'].items():
                    # When topping up, never clobber an entry that may
                    # have been updated in memory since the first load
                    if fresh or self.words.get(word) is None:
                        self.words[word] = entry
            if contents.get('FILES'):       # If it has some files, add them
                if fresh or len(self.files) <= 1:
                    self.files = contents['FILES']
            if contents.get('FILEIDS'):     # If it has fileids, add them
                if fresh or not self.fileids:
                    self.fileids = contents['FILEIDS']

    def _read_segment(self, filename):
        "Return the object pickled in filename, or None if unreadable"
        import cPickle, zlib
        try:
            segment_file = open(filename, 'rb')
        except IOError:
            return None
        try:
            try:
                # NOTE: unpickling can execute code stored in the file, so
                # index files must only ever come from a trusted location.
                return cPickle.loads(zlib.decompress(segment_file.read()))
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception:
                return None     # damaged segment is treated like a missing one
        finally:
            segment_file.close()

    def _write_segment(self, filename, db):
        "Pickle and compress db into filename"
        import cPickle, zlib
        segment_file = open(filename, 'wb')
        try:
            segment_file.write(zlib.compress(cPickle.dumps(db, 1)))
        finally:
            segment_file.close()

    def julienne(self, INDEXDB=None):
        """Write the index out as one file per segment

        All existing segment files are deleted first, so an index that
        was only partially loaded is completed from disk before anything
        is removed.
        """
        import stat
        INDEXDB = INDEXDB or self.indexdb
        if getattr(self, '_loaded_segments', None) is not None:
            self.load_index(self._segment_source)
        for segment in self.segments:   # all the (little) indexes
            try:    # brutal space saver... delete all the small segments
                os.remove(INDEXDB+segment)
            except OSError:
                pass    # probably just nonexistent segment index file
        # First write the much simpler filename/fileid dictionaries
        self._write_segment(INDEXDB+'-', Index(None, self.files, self.fileids))
        # The hard part is splitting the word dictionary up, of course
        letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        segdicts = {}                   # Need batch of empty dicts
        for segment in letters+'#':
            segdicts[segment] = {}
        for word, entry in self.words.items():  # Split into segment dicts
            initchar = word[:1].upper()
            if initchar and initchar in letters:
                segdicts[initchar][word] = entry
            else:
                segdicts['#'][word] = entry
        # rw-rw-r-- (octal 664): keep the segments group writable
        mode = (stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP |
                stat.S_IROTH)
        for initchar in letters+'#':
            filename = INDEXDB+initchar
            self._write_segment(filename, Index(segdicts[initchar], None, None))
            os.chmod(filename, mode)

    save_index = julienne
+ opts = opts+1 + if upper(opt[1:]) == 'INDEX': # Index files + ndx.quiet = 0 + pass # Use defaults if no other options + elif upper(opt[1:]) == 'REINDEX': # Reindex + ndx.reindex = 1 + elif upper(opt[1:]) == 'CASESENSITIVE': # Case sensitive + ndx.casesensitive = 1 + elif upper(opt[1:]) in ('NORECURSE','LOCAL'): # No recursion + descend = 0 + elif upper(opt[1:4]) == 'DIR': # Dir to index + dir = opt[5:] + elif upper(opt[1:8]) == 'INDEXDB': # Index specified + ndx.indexdb = opt[9:] + sys.stderr.write( + "Use of INDEXER_DB environment variable is STRONGLY recommended.\n") + elif upper(opt[1:6]) == 'REGEX': # RegEx files to index + ndx.add_pattern = re.compile(opt[7:]) + elif upper(opt[1:5]) == 'GLOB': # Glob files to index + ndx.add_pattern = opt[6:] + elif upper(opt[1:7]) in ('OUTPUT','FORMAT'): # How should results look? + opts = opts-1 # this is not an option for indexing purposes + level = upper(opt[8:]) + if level in ('ALL','EVERYTHING','VERBOSE', 'MAX'): + ndx.quiet = 0 + elif level in ('RATINGS','SCORES','HIGH'): + ndx.quiet = 3 + elif level in ('FILENAMES','NAMES','FILES','MID'): + ndx.quiet = 5 + elif level in ('SUMMARY','MIN'): + ndx.quiet = 9 + elif upper(opt[1:7]) == 'FILTER': # Regex filter output + opts = opts-1 # this is not an option for indexing purposes + ndx.filter = opt[8:] + elif opt[1:] in string.digits: + opts = opts-1 + ndx.quiet = eval(opt[1]) + else: + search_words.append(opt) # Search words + + if opts > 0: + ndx.add_files(dir=dir) + ndx.save_index() + if search_words: + ndx.find(search_words, print_report=1) + if not opts and not search_words: + sys.stderr.write("Perhaps you would like to use the --help option?\n") + else: + sys.stderr.write('Processed in %.3f seconds (%s)' + % (time.time()-start, ndx.whoami())) + +# +#$Log: not supported by cvs2svn $ +#Revision 1.1.2.3 2002/04/03 12:05:15 rochecompaan +#Removed dos control characters. +# +#Revision 1.1.2.2 2002/04/03 12:01:55 rochecompaan +#Oops. Forgot to include cvs keywords in file. 
+# diff --git a/roundup/roundup_indexer.py b/roundup/roundup_indexer.py new file mode 100644 index 0000000..a66261c --- /dev/null +++ b/roundup/roundup_indexer.py @@ -0,0 +1,98 @@ +# +# Copyright (c) 2001 Bizar Software Pty Ltd (http://www.bizarsoftware.com.au/) +# This module is free software, and you may redistribute it and/or modify +# under the same terms as Python, so long as this copyright message and +# disclaimer are retained in their original form. +# +# IN NO EVENT SHALL BIZAR SOFTWARE PTY LTD BE LIABLE TO ANY PARTY FOR +# DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING +# OUT OF THE USE OF THIS CODE, EVEN IF THE AUTHOR HAS BEEN ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# BIZAR SOFTWARE PTY LTD SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, +# BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE. THE CODE PROVIDED HEREUNDER IS ON AN "AS IS" +# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, +# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. +# +#$Id: roundup_indexer.py,v 1.2 2002-05-25 07:16:24 rochecompaan Exp $ +''' +This module provides an indexer class, RoundupIndexer, that stores text +indices in a roundup instance. This class makes searching the content of +messages and text files possible. 
class RoundupIndexer(SlicedZPickleIndexer):
    ''' Indexes messages and files
    '''

    def __init__(self, db_path):
        ''' Set up the index stored in <db_path>/indexes/index.db

            If the "indexes" directory does not exist yet it is created
            (mode rwxrwxr-x) and everything below <db_path>/files is
            indexed and saved straight away.
        '''
        import stat
        indexdb_path = os.path.join(db_path, 'indexes')
        index_exists = os.path.exists(indexdb_path)
        if not index_exists:
            os.makedirs(indexdb_path)
            # rwxrwxr-x (octal 775)
            os.chmod(indexdb_path,
                stat.S_IRWXU | stat.S_IRWXG | stat.S_IROTH | stat.S_IXOTH)
        index_path = os.path.join(indexdb_path, 'index.db')
        SlicedZPickleIndexer.__init__(self,
            INDEXDB=index_path, QUIET=9)
        if not index_exists:
            self.add_files(dir=os.path.join(db_path, 'files'))
            self.save_index()

    def search(self, search_terms, klass):
        ''' Find the nodes of klass whose messages or files match

            Every indexed file containing ALL of search_terms is mapped
            back to a node: the file is named "msg<id>" or "file<id>",
            and klass.find(messages=<id>) / klass.find(files=<id>) is
            asked which nodes refer to it (only the first one returned
            is used).

            Returns {nodeid: {'messages': [msgid, ...],
                              'files': [fileid, ...]}}
            with only the property names that actually matched; an empty
            dictionary means nothing was found.
        '''
        nodeids = {}
        hits = self.find(search_terms)      # 0 or {} when nothing matches
        if not hits:
            return nodeids

        designator_propname = {'msg': 'messages',
                               'file': 'files'}
        # build a dictionary of nodes and their associated messages
        # and files
        for path in hits.values():
            filename = os.path.basename(path)
            for designator, propname in designator_propname.items():
                # the designator is a prefix; the rest is the node id
                if not filename.startswith(designator):
                    continue
                nodeid = filename[len(designator):]
                result = klass.find(**{propname: nodeid})
                if not result:
                    continue
                owner = str(result[0])
                node_dict = nodeids.setdefault(owner, {})
                node_dict.setdefault(propname, []).append(nodeid)

        return nodeids
+# +#Revision 1.1.2.2 2002/04/19 19:54:42 rochecompaan +#cgi_client.py +# removed search link for the time being +# moved rendering of matches to htmltemplate +#hyperdb.py +# filtering of nodes on full text search incorporated in filter method +#roundupdb.py +# added parameter to call of filter method +#roundup_indexer.py +# added search method to RoundupIndexer class +# +#Revision 1.1.2.1 2002/04/03 11:55:57 rochecompaan +# . Added feature #526730 - search for messages capability +# diff --git a/roundup/roundupdb.py b/roundup/roundupdb.py index f870bb9..f874db8 100644 --- a/roundup/roundupdb.py +++ b/roundup/roundupdb.py @@ -15,7 +15,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -# $Id: roundupdb.py,v 1.52 2002-05-15 03:27:16 richard Exp $ +# $Id: roundupdb.py,v 1.53 2002-05-25 07:16:24 rochecompaan Exp $ __doc__ = """ Extending hyperdb with types specific to issue-tracking. @@ -80,7 +80,7 @@ class Database: # try the user alternate addresses if possible props = self.user.getprops() if props.has_key('alternate_addresses'): - users = self.user.filter({'alternate_addresses': address}, + users = self.user.filter(None, {'alternate_addresses': address}, [], []) user = extractUserFromList(self.user, users) if user is not None: return user @@ -625,6 +625,15 @@ class IssueClass(Class): # # $Log: not supported by cvs2svn $ +# Revision 1.52 2002/05/15 03:27:16 richard +# . fixed SCRIPT_NAME in ZRoundup for instances not at top level of Zope +# (thanks dman) +# . fixed some sorting issues that were breaking some unit tests under py2.2 +# .
mailgw test output dir was confusing the init test (but only on 2.2 *shrug*) +# +# fixed bug in the init unit test that meant only the bsddb test ran if it +# could (it clobbered the anydbm test) +# # Revision 1.51 2002/04/08 03:46:42 richard # make it work # @@ -637,6 +646,17 @@ class IssueClass(Class): # The initial detector is one that we'll be using here at ekit - it bounces new # issue messages to a team address. # +# Revision 1.49.2.1 2002/04/19 19:54:42 rochecompaan +# cgi_client.py +# removed search link for the time being +# moved rendering of matches to htmltemplate +# hyperdb.py +# filtering of nodes on full text search incorporated in filter method +# roundupdb.py +# added paramater to call of filter method +# roundup_indexer.py +# added search method to RoundupIndexer class +# # Revision 1.49 2002/03/19 06:41:49 richard # Faster, easier, less mess ;) # diff --git a/roundup/templates/classic/instance_config.py b/roundup/templates/classic/instance_config.py index 0a599ae..cd06f42 100644 --- a/roundup/templates/classic/instance_config.py +++ b/roundup/templates/classic/instance_config.py @@ -15,7 +15,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -# $Id: instance_config.py,v 1.17 2002-05-22 00:32:33 richard Exp $ +# $Id: instance_config.py,v 1.18 2002-05-25 07:16:25 rochecompaan Exp $ MAIL_DOMAIN=MAILHOST=HTTP_HOST=None HTTP_PORT=0 @@ -117,6 +117,12 @@ HEADER_INDEX_LINKS = ['DEFAULT', 'UNASSIGNED', 'USER'] # list the classes that users are able to add nodes to HEADER_ADD_LINKS = ['issue'] +# list the classes that users can search +HEADER_SEARCH_LINKS = ['issue'] + +# list search filters per class +SEARCH_FILTERS = ['ISSUE_FILTER', 'SUPPORT_FILTER'] + # Now the DEFAULT display specification. 
TODO: describe format DEFAULT_INDEX = { 'LABEL': 'All Issues', @@ -159,9 +165,28 @@ USER_INDEX = { }, } +ISSUE_FILTER = { + 'CLASS': 'issue', + 'FILTER': ['status', 'priority', 'assignedto', 'creator'] +} + +SUPPORT_FILTER = { + 'CLASS': 'issue', + 'FILTER': ['status', 'priority', 'assignedto', 'creator'] +} + # # $Log: not supported by cvs2svn $ +# Revision 1.17 2002/05/22 00:32:33 richard +# . changed the default message list in issues to display the message body +# . made backends.__init__ be more specific about which ImportErrors it really +# wants to ignore +# . fixed the example addresses in the templates to use correct example domains +# . cleaned out the template stylesheets, removing a bunch of junk that really +# wasn't necessary (font specs, styles never used) and added a style for +# message content +# # Revision 1.16 2002/05/21 06:05:54 richard # . #551483 ] assignedto in Client.make_index_link # @@ -178,6 +203,15 @@ USER_INDEX = { # the config variables EMAIL_KEEP_QUOTED_TEST and # EMAIL_LEAVE_BODY_UNCHANGED. # +# Revision 1.13.2.2 2002/05/02 11:49:19 rochecompaan +# Allow customization of the search filters that should be displayed +# on the search page. +# +# Revision 1.13.2.1 2002/04/20 13:23:33 rochecompaan +# We now have a separate search page for nodes. Search links for +# different classes can be customized in instance_config similar to +# index links. +# # Revision 1.13 2002/03/14 23:59:24 richard # . #517734 ] web header customisation is obscure # diff --git a/roundup/templates/extended/instance_config.py b/roundup/templates/extended/instance_config.py index b10c039..ec6a3bf 100644 --- a/roundup/templates/extended/instance_config.py +++ b/roundup/templates/extended/instance_config.py @@ -15,7 +15,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
# -# $Id: instance_config.py,v 1.17 2002-05-22 00:32:34 richard Exp $ +# $Id: instance_config.py,v 1.18 2002-05-25 07:16:25 rochecompaan Exp $ MAIL_DOMAIN=MAILHOST=HTTP_HOST=None HTTP_PORT=0 @@ -118,6 +118,11 @@ HEADER_INDEX_LINKS = ['DEFAULT', 'ALL_SUPPORT', 'UNASSIGNED_ISSUE', # list the classes that users are able to add nodes to HEADER_ADD_LINKS = ['issue', 'support'] +# list the classes that users can search +HEADER_SEARCH_LINKS = ['issue', 'support'] + +SEARCH_FILTERS = ['ISSUE_FILTER', 'SUPPORT_FILTER'] + # Now the DEFAULT display specifications. TODO: describe format DEFAULT_INDEX = { 'LABEL': 'All Issues', @@ -197,8 +202,27 @@ MY_SUPPORT_INDEX = { }, } +ISSUE_FILTER = { + 'CLASS': 'issue', + 'FILTER': ['status', 'priority', 'assignedto', 'creator'] +} + +SUPPORT_FILTER = { + 'CLASS': 'issue', + 'FILTER': ['status', 'priority', 'assignedto', 'creator'] +} + # # $Log: not supported by cvs2svn $ +# Revision 1.17 2002/05/22 00:32:34 richard +# . changed the default message list in issues to display the message body +# . made backends.__init__ be more specific about which ImportErrors it really +# wants to ignore +# . fixed the example addresses in the templates to use correct example domains +# . cleaned out the template stylesheets, removing a bunch of junk that really +# wasn't necessary (font specs, styles never used) and added a style for +# message content +# # Revision 1.16 2002/05/21 06:05:54 richard # . #551483 ] assignedto in Client.make_index_link # @@ -215,6 +239,15 @@ MY_SUPPORT_INDEX = { # the config variables EMAIL_KEEP_QUOTED_TEST and # EMAIL_LEAVE_BODY_UNCHANGED. # +# Revision 1.13.2.2 2002/05/02 11:49:19 rochecompaan +# Allow customization of the search filters that should be displayed +# on the search page. +# +# Revision 1.13.2.1 2002/04/20 13:23:34 rochecompaan +# We now have a separate search page for nodes. Search links for +# different classes can be customized in instance_config similar to +# index links. 
+# # Revision 1.13 2002/03/14 23:59:24 richard # . #517734 ] web header customisation is obscure # diff --git a/run_tests b/run_tests index 032b7c9..54eb006 100755 --- a/run_tests +++ b/run_tests @@ -1,4 +1,4 @@ -#! /usr/bin/env python +#! /usr/bin/env python2.2 # # Copyright (c) 2001 Richard Jones # This module is free software, and you may redistribute it and/or modify @@ -9,7 +9,7 @@ # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. # -# $Id: run_tests,v 1.4 2002-02-14 23:38:12 richard Exp $ +# $Id: run_tests,v 1.5 2002-05-25 07:16:23 rochecompaan Exp $ from test import go import sys @@ -20,6 +20,13 @@ else: # # $Log: not supported by cvs2svn $ +# Revision 1.4 2002/02/14 23:38:12 richard +# Fixed the unit tests for the mailgw re: the x-roundup-name header. +# Also made the test runner more user-friendly: +# ./run_tests - detect all tests in test/test_<name>.py and run them +# ./run_tests <name> - run only test/test_<name>.py +# eg ./run_tests mailgw - run the mailgw test from test/test_mailgw.py +# # Revision 1.3 2002/01/23 20:09:41 jhermann # Proper fix for failing test # -- 2.30.2