commitdiff (parent: f04ca7e)
author     rochecompaan <rochecompaan@57a73879-2fb5-44c3-a270-3262357dd7e2>
           Sat, 25 May 2002 07:16:25 +0000 (07:16 +0000)
committer  rochecompaan <rochecompaan@57a73879-2fb5-44c3-a270-3262357dd7e2>
           Sat, 25 May 2002 07:16:25 +0000 (07:16 +0000)
git-svn-id: http://svn.roundup-tracker.org/svnroot/roundup/trunk@762 57a73879-2fb5-44c3-a270-3262357dd7e2
CHANGES.txt
roundup/backends/back_anydbm.py
roundup/cgi_client.py
roundup/htmltemplate.py
roundup/hyperdb.py
roundup/indexer.py                              [new file with mode: 0644]
roundup/roundup_indexer.py                      [new file with mode: 0644]
roundup/roundupdb.py
roundup/templates/classic/instance_config.py
roundup/templates/extended/instance_config.py
run_tests
diff --git a/CHANGES.txt b/CHANGES.txt
index 51d1297d7ff10149016c7fa043b03720d3bf5436..ec62d75cef6a1d634dd4f36e3507c6d2a237d456 100644 (file)
--- a/CHANGES.txt
+++ b/CHANGES.txt
. applied patch #558876 ] cgi client customization
. split instance initialisation into two steps, allowing config changes
before the database is initialised.
+ . #526730 ] search for messages capability
Fixed:
. stop sending blank (whitespace-only) notes
diff --git a/roundup/backends/back_anydbm.py b/roundup/backends/back_anydbm.py
index 0ee8fbbcf65776e67f64c88ec6e12f2097a8f3a1..5ea64da8956f9f76bf1daccdd1f7cc0bbaf3c4cf 100644 (file)
--- a/roundup/backends/back_anydbm.py
+++ b/roundup/backends/back_anydbm.py
# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
#
-#$Id: back_anydbm.py,v 1.34 2002-05-15 06:21:21 richard Exp $
+#$Id: back_anydbm.py,v 1.35 2002-05-25 07:16:24 rochecompaan Exp $
'''
This module defines a backend that saves the hyperdatabase in a database
chosen by anydbm. It is guaranteed to always be available in python
import whichdb, anydbm, os, marshal
from roundup import hyperdb, date
from blobfiles import FileStorage
+from roundup.roundup_indexer import RoundupIndexer
from locking import acquire_lock, release_lock
#
self.dirtynodes = {} # keep track of the dirty nodes by class
self.newnodes = {} # keep track of the new nodes by class
self.transactions = []
+ self.indexer = RoundupIndexer(self.dir)
# ensure files are group readable and writable
os.umask(0002)
def _doStoreFile(self, name, **databases):
# the file is currently ".tmp" - move it to its real name to commit
os.rename(name+".tmp", name)
+ pattern = name.split('/')[-1]
+ self.indexer.add_files(dir=os.path.dirname(name), pattern=pattern)
+ self.indexer.save_index()
def rollback(self):
''' Reverse all actions from the current transaction.
#
#$Log: not supported by cvs2svn $
+#Revision 1.34 2002/05/15 06:21:21 richard
+# . node caching now works, and gives a small boost in performance
+#
+#As a part of this, I cleaned up the DEBUG output and implemented TRACE
+#output (HYPERDBTRACE='file to trace to') with checkpoints at the start of
+#CGI requests. Run roundup with python -O to skip all the DEBUG/TRACE stuff
+#(using if __debug__ which is compiled out with -O)
+#
#Revision 1.33 2002/04/24 10:38:26 rochecompaan
#All database files are now created group readable and writable.
#
#
#Unit tests for all of the above written.
#
+#Revision 1.30.2.1 2002/04/03 11:55:57 rochecompaan
+# . Added feature #526730 - search for messages capability
+#
#Revision 1.30 2002/02/27 03:40:59 richard
#Ran it through pychecker, made fixes
#
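The back_anydbm hunks above hook indexing into the file-commit path: once a
stored file is renamed into place, only that file is (re)indexed and the index
is saved. A minimal sketch of the same flow, with a hypothetical tracker path
(RoundupIndexer is the wrapper class added later in this commit):

    import os
    from roundup.roundup_indexer import RoundupIndexer

    indexer = RoundupIndexer('/tracker/db')     # hypothetical db directory
    name = '/tracker/db/files/msg/0/msg23'      # the file just committed
    pattern = name.split('/')[-1]               # restrict indexing to that file
    indexer.add_files(dir=os.path.dirname(name), pattern=pattern)
    indexer.save_index()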
diff --git a/roundup/cgi_client.py b/roundup/cgi_client.py
index fcefab93ebe2ff7c09d708446cd13f6ebfa0ce0a..8fb4139db0fef69117a94675c1dda625d705a0a5 100644 (file)
--- a/roundup/cgi_client.py
+++ b/roundup/cgi_client.py
# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
#
-# $Id: cgi_client.py,v 1.124 2002-05-24 02:09:24 richard Exp $
+# $Id: cgi_client.py,v 1.125 2002-05-25 07:16:24 rochecompaan Exp $
__doc__ = """
WWW request handler (also used in the stand-alone server).
import roundupdb, htmltemplate, date, hyperdb, password
from roundup.i18n import _
+from roundup_indexer import RoundupIndexer
class Unauthorised(ValueError):
pass
except ValueError:
# someone gave us a non-int debug level, turn it off
self.debug = 0
+ self.indexer = RoundupIndexer('%s/db'%instance.INSTANCE_HOME)
def getuid(self):
return self.db.user.lookup(self.user)
links.append(_('<a href="user">User List</a>'))
links.append(_('<a href="newuser">Add User</a>'))
+ # add the search links
+ if hasattr(self.instance, 'HEADER_SEARCH_LINKS'):
+ classes = self.instance.HEADER_SEARCH_LINKS
+ else:
+ classes = ['issue']
+ l = []
+ for class_name in classes:
+ cap_class = class_name.capitalize()
+ links.append(_('Search <a href="search%(class_name)s">'
+ '%(cap_class)s</a>')%locals())
+
# now we have all the links, join 'em
links = '\n | '.join(links)
default_index_columns = ['id','activity','title','status','assignedto']
default_index_filterspec = {'status': ['1', '2', '3', '4', '5', '6', '7']}
- def index(self):
- ''' put up an index - no class specified
- '''
+ def _get_customisation_info(self):
# see if the web has supplied us with any customisation info
defaults = 1
for key in ':sort', ':group', ':filter', ':columns':
# make list() extract the info from the CGI environ
self.classname = 'issue'
sort = group = filter = columns = filterspec = None
+ return columns, filter, group, sort, filterspec
+
+ def index(self):
+ ''' put up an index - no class specified
+ '''
+ columns, filter, group, sort, filterspec = \
+ self._get_customisation_info()
return self.list(columns=columns, filter=filter, group=group,
sort=sort, filterspec=filterspec)
+ def searchnode(self):
+ columns, filter, group, sort, filterspec = \
+ self._get_customisation_info()
+ show_nodes = 1
+ if len(self.form.keys()) == 0:
+ # get the default search filters from instance_config
+ if hasattr(self.instance, 'SEARCH_FILTERS'):
+ for f in self.instance.SEARCH_FILTERS:
+ spec = getattr(self.instance, f)
+ if spec['CLASS'] == self.classname:
+ filter = spec['FILTER']
+
+ show_nodes = 0
+ show_customization = 1
+ return self.list(columns=columns, filter=filter, group=group,
+ sort=sort, filterspec=filterspec,
+ show_customization=show_customization, show_nodes=show_nodes)
+
+
# XXX deviates from spec - loses the '+' (that's a reserved character
    # in URLs)
def list(self, sort=None, group=None, filter=None, columns=None,
- filterspec=None, show_customization=None):
+ filterspec=None, show_customization=None, show_nodes=1):
''' call the template index with the args
        :sort - sort by prop name, optionally preceded with '-'
if filterspec is None: filterspec = self.index_filterspec(filter)
if show_customization is None:
show_customization = self.customization_widget()
+ if self.form.has_key('search_text'):
+ search_text = self.form['search_text'].value
+ else:
+ search_text = ''
index = htmltemplate.IndexTemplate(self, self.instance.TEMPLATES, cn)
try:
- index.render(filterspec, filter, columns, sort, group,
- show_customization=show_customization)
+ index.render(filterspec, search_text, filter, columns, sort,
+ group, show_customization=show_customization,
+ show_nodes=show_nodes)
except htmltemplate.MissingTemplateError:
self.basicClassEditPage()
self.pagefoot()
self.pagefoot()
showissue = shownode
showmsg = shownode
+ searchissue = searchnode
def _add_author_to_nosy(self, props):
''' add the author value from the props to the nosy list
self.db.commit()
def do_action(self, action, dre=re.compile(r'([^\d]+)(\d+)'),
- nre=re.compile(r'new(\w+)')):
+ nre=re.compile(r'new(\w+)'), sre=re.compile(r'search(\w+)')):
'''Figure the user's action and do it.
'''
# here be the "normal" functionality
func()
return
+        # see if we're to put up the search page for a class
+ m = sre.match(action)
+ if m:
+ self.classname = m.group(1)
+ try:
+ func = getattr(self, 'search%s'%self.classname)
+ except AttributeError:
+ raise NotFound
+ func()
+ return
+
# otherwise, display the named class
self.classname = action
try:
showtimelog = Client.shownode
newsupport = Client.newnode
newtimelog = Client.newnode
+ searchsupport = Client.searchnode
default_index_sort = ['-activity']
default_index_group = ['priority']
#
# $Log: not supported by cvs2svn $
+# Revision 1.124 2002/05/24 02:09:24 richard
+# Nothing like a live demo to show up the bugs ;)
+#
# Revision 1.123 2002/05/22 05:04:13 richard
# Oops
#
# Revision 1.115 2002/04/02 01:56:10 richard
# . stop sending blank (whitespace-only) notes
#
+# Revision 1.114.2.4 2002/05/02 11:49:18 rochecompaan
+# Allow customization of the search filters that should be displayed
+# on the search page.
+#
+# Revision 1.114.2.3 2002/04/20 13:23:31 rochecompaan
+# We now have a separate search page for nodes. Search links for
+# different classes can be customized in instance_config similar to
+# index links.
+#
+# Revision 1.114.2.2 2002/04/19 19:54:42 rochecompaan
+# cgi_client.py
+# removed search link for the time being
+# moved rendering of matches to htmltemplate
+# hyperdb.py
+# filtering of nodes on full text search incorporated in filter method
+# roundupdb.py
+# added parameter to call of filter method
+# roundup_indexer.py
+# added search method to RoundupIndexer class
+#
+# Revision 1.114.2.1 2002/04/03 11:55:57 rochecompaan
+# . Added feature #526730 - search for messages capability
+#
# Revision 1.114 2002/03/17 23:06:05 richard
# oops
#
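The new `sre` pattern in do_action routes actions of the form search<class>:
the class name is captured and dispatched to the matching search<class>
method, which the class bodies alias to searchnode (searchissue,
searchsupport). A small illustration of the routing, using the names from the
hunk above:

    import re
    sre = re.compile(r'search(\w+)')
    m = sre.match('searchissue')
    if m:
        classname = m.group(1)          # 'issue'
        # do_action then looks up getattr(self, 'search' + classname),
        # i.e. searchissue, an alias for searchnode
        print 'dispatching to search%s' % classname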
diff --git a/roundup/htmltemplate.py b/roundup/htmltemplate.py
index 2f6557a1fdb0abba2b3e4d660048c9cfdc2e923b..b5b998761308af78fa2fafa6e05d3b16c12abf3b 100644 (file)
--- a/roundup/htmltemplate.py
+++ b/roundup/htmltemplate.py
# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
#
-# $Id: htmltemplate.py,v 1.89 2002-05-15 06:34:47 richard Exp $
+# $Id: htmltemplate.py,v 1.90 2002-05-25 07:16:24 rochecompaan Exp $
__doc__ = """
Template engine.
def go(self, text):
return self.replace.sub(self, text)
- def __call__(self, m, filter=None, columns=None, sort=None, group=None):
+ def __call__(self, m, search_text=None, filter=None, columns=None,
+ sort=None, group=None):
if m.group('name'):
if m.group('name') in self.props:
text = m.group('text')
self.properties = self.cl.getprops()
col_re=re.compile(r'<property\s+name="([^>]+)">')
- def render(self, filterspec={}, filter=[], columns=[], sort=[], group=[],
- show_display_form=1, nodeids=None, show_customization=1):
+ def render(self, filterspec={}, search_text='', filter=[], columns=[],
+ sort=[], group=[], show_display_form=1, nodeids=None,
+ show_customization=1, show_nodes=1):
self.filterspec = filterspec
w = self.client.write
if (show_display_form and
self.instance.FILTER_POSITION in ('top and bottom', 'top')):
w('<form onSubmit="return submit_once()" action="%s">\n'%self.classname)
- self.filter_section(filter_template, filter, columns, group,
- all_filters, all_columns, show_customization)
+ self.filter_section(filter_template, search_text, filter,
+ columns, group, all_filters, all_columns, show_customization)
# make sure that the sorting doesn't get lost either
if sort:
w('<input type="hidden" name=":sort" value="%s">'%
# now actually loop through all the nodes we get from the filter and
# apply the template
- if nodeids is None:
- nodeids = self.cl.filter(filterspec, sort, group)
- for nodeid in nodeids:
- # check for a group heading
- if group_names:
- this_group = [self.cl.get(nodeid, name, _('[no value]'))
- for name in group_names]
- if this_group != old_group:
- l = []
- for name in group_names:
- prop = self.properties[name]
- if isinstance(prop, hyperdb.Link):
- group_cl = self.db.classes[prop.classname]
- key = group_cl.getkey()
- value = self.cl.get(nodeid, name)
- if value is None:
- l.append(_('[unselected %(classname)s]')%{
- 'classname': prop.classname})
- else:
- l.append(group_cl.get(self.cl.get(nodeid,
- name), key))
- elif isinstance(prop, hyperdb.Multilink):
- group_cl = self.db.classes[prop.classname]
- key = group_cl.getkey()
- for value in self.cl.get(nodeid, name):
- l.append(group_cl.get(value, key))
- else:
- value = self.cl.get(nodeid, name, _('[no value]'))
- if value is None:
- value = _('[empty %(name)s]')%locals()
+ if show_nodes:
+ matches = None
+ if nodeids is None:
+ if search_text != '':
+ matches = self.client.indexer.search(
+ search_text.split(' '), self.cl)
+ nodeids = self.cl.filter(matches, filterspec, sort, group)
+ for nodeid in nodeids:
+ # check for a group heading
+ if group_names:
+ this_group = [self.cl.get(nodeid, name, _('[no value]'))
+ for name in group_names]
+ if this_group != old_group:
+ l = []
+ for name in group_names:
+ prop = self.properties[name]
+ if isinstance(prop, hyperdb.Link):
+ group_cl = self.db.classes[prop.classname]
+ key = group_cl.getkey()
+ value = self.cl.get(nodeid, name)
+ if value is None:
+ l.append(_('[unselected %(classname)s]')%{
+ 'classname': prop.classname})
+ else:
+ l.append(group_cl.get(self.cl.get(nodeid,
+ name), key))
+ elif isinstance(prop, hyperdb.Multilink):
+ group_cl = self.db.classes[prop.classname]
+ key = group_cl.getkey()
+ for value in self.cl.get(nodeid, name):
+ l.append(group_cl.get(value, key))
else:
- value = str(value)
- l.append(value)
- w('<tr class="section-bar">'
- '<td align=middle colspan=%s><strong>%s</strong></td></tr>'%(
- len(columns), ', '.join(l)))
- old_group = this_group
-
- # display this node's row
- replace = IndexTemplateReplace(self.globals, locals(), columns)
- self.nodeid = nodeid
- w(replace.go(template))
- self.nodeid = None
+ value = self.cl.get(nodeid, name,
+ _('[no value]'))
+ if value is None:
+ value = _('[empty %(name)s]')%locals()
+ else:
+ value = str(value)
+ l.append(value)
+ w('<tr class="section-bar">'
+ '<td align=middle colspan=%s>'
+ '<strong>%s</strong></td></tr>'%(
+ len(columns), ', '.join(l)))
+ old_group = this_group
+
+ # display this node's row
+ replace = IndexTemplateReplace(self.globals, locals(), columns)
+ self.nodeid = nodeid
+ w(replace.go(template))
+ if matches:
+ self.node_matches(matches[nodeid], len(columns))
+ self.nodeid = None
w('</table>')
if (show_display_form and hasattr(self.instance, 'FILTER_POSITION') and
self.instance.FILTER_POSITION in ('top and bottom', 'bottom')):
w('<form onSubmit="return submit_once()" action="%s">\n'%self.classname)
- self.filter_section(filter_template, filter, columns, group,
- all_filters, all_columns, show_customization)
+ self.filter_section(filter_template, search_text, filter,
+ columns, group, all_filters, all_columns, show_customization)
# make sure that the sorting doesn't get lost either
if sort:
w('<input type="hidden" name=":sort" value="%s">'%
','.join(sort))
w('</form>\n')
+ def node_matches(self, match, colspan):
+ ''' display the files and messages for a node that matched a
+ full text search
+ '''
+ w = self.client.write
- def filter_section(self, template, filter, columns, group, all_filters,
- all_columns, show_customization):
+ message_links = []
+ file_links = []
+ if match.has_key('messages'):
+ for msgid in match['messages']:
+ k = self.db.msg.labelprop()
+ lab = self.db.msg.get(msgid, k)
+ msgpath = 'msg%s'%msgid
+ message_links.append('<a href="%(msgpath)s">%(lab)s</a>'
+ %locals())
+ w(_('<tr class="row-hilite"><td colspan="%s">'
+ ' Matched messages: %s</td></tr>')%(
+ colspan, ', '.join(message_links)))
+
+ if match.has_key('files'):
+ for fileid in match['files']:
+ filename = self.db.file.get(fileid, 'name')
+ filepath = 'file%s/%s'%(fileid, filename)
+ file_links.append('<a href="%(filepath)s">%(filename)s</a>'
+ %locals())
+ w(_('<tr class="row-hilite"><td colspan="%s">'
+ ' Matched files: %s</td></tr>')%(
+ colspan, ', '.join(file_links)))
+
+
+ def filter_section(self, template, search_text, filter, columns, group,
+ all_filters, all_columns, show_customization):
w = self.client.write
w('<tr class="location-bar">')
w(_(' <th align="left" colspan="2">Filter specification...</th>'))
w('</tr>')
+ w('<tr>')
+ w('<th class="location-bar">Search terms</th>')
+ w('<td><input name="search_text" value="%s" size="50"></td>'%(
+ search_text))
+ w('</tr>')
replace = IndexTemplateReplace(self.globals, locals(), filter)
w(replace.go(template))
w('<tr class="location-bar"><td width="1%%"> </td>')
w(':sort=%s'%','.join(m[:2]))
return '&'.join(l)
+
#
# ITEM TEMPLATES
#
#
# $Log: not supported by cvs2svn $
+# Revision 1.89 2002/05/15 06:34:47 richard
+# forgot to fix the templating for last change
+#
# Revision 1.88 2002/04/24 08:34:35 rochecompaan
# Sorting was applied to all nodes of the MultiLink class instead of
# the nodes that are actually linked to in the "field" template
# text. The link value is displayed as a tooltip using the title anchor
# attribute.
#
+# Revision 1.84.2.2 2002/04/20 13:23:32 rochecompaan
+# We now have a separate search page for nodes. Search links for
+# different classes can be customized in instance_config similar to
+# index links.
+#
+# Revision 1.84.2.1 2002/04/19 19:54:42 rochecompaan
+# cgi_client.py
+# removed search link for the time being
+# moved rendering of matches to htmltemplate
+# hyperdb.py
+# filtering of nodes on full text search incorporated in filter method
+# roundupdb.py
+# added parameter to call of filter method
+# roundup_indexer.py
+# added search method to RoundupIndexer class
+#
# Revision 1.84 2002/03/29 19:41:48 rochecompaan
# . Fixed display of mutlilink properties when using the template
# functions, menu and plain.
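node_matches consumes the per-node match dictionary built by
RoundupIndexer.search: for each matched node id there are optional 'messages'
and 'files' lists naming the items that produced the hit. A sketch of that
shape, with invented ids:

    # as returned by self.client.indexer.search(['spam'], self.cl)
    matches = {
        '12': {'messages': ['23', '24']},   # issue12 hit via two messages
        '15': {'files': ['7']},             # issue15 hit via an attached file
    }
    # render() passes matches[nodeid] to node_matches() below each row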
diff --git a/roundup/hyperdb.py b/roundup/hyperdb.py
index bf532d2a644673696df63a8ffbd64a7fd2c98f61..776704ca0bdfca7fcf4ebe08398fa3c1a6012e0d 100644 (file)
--- a/roundup/hyperdb.py
+++ b/roundup/hyperdb.py
# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
#
-# $Id: hyperdb.py,v 1.65 2002-05-22 04:12:05 richard Exp $
+# $Id: hyperdb.py,v 1.66 2002-05-25 07:16:24 rochecompaan Exp $
__doc__ = """
Hyperdatabase implementation, especially field types.
return l
# XXX not in spec
- def filter(self, filterspec, sort, group, num_re = re.compile('^\d+$')):
+ def filter(self, search_matches, filterspec, sort, group,
+ num_re = re.compile('^\d+$')):
''' Return a list of the ids of the active nodes in this class that
match the 'filter' spec, sorted by the group spec and then the
sort spec
l.append((nodeid, node))
l.sort()
+        # filter based on full text search
+        if search_matches is not None:
+            k = []
+            for v in l:
+                if search_matches.has_key(v[0]):
+                    k.append(v)
+            l = k
+
# optimise sort
m = []
for entry in sort:
#
# $Log: not supported by cvs2svn $
+# Revision 1.65 2002/05/22 04:12:05 richard
+# . applied patch #558876 ] cgi client customization
+# ... with significant additions and modifications ;)
+# - extended handling of ML assignedto to all places it's handled
+# - added more NotFound info
+#
# Revision 1.64 2002/05/15 06:21:21 richard
# . node caching now works, and gives a small boost in performance
#
#
# Unit tests for all of the above written.
#
+# Revision 1.59.2.2 2002/04/20 13:23:33 rochecompaan
+# We now have a separate search page for nodes. Search links for
+# different classes can be customized in instance_config similar to
+# index links.
+#
+# Revision 1.59.2.1 2002/04/19 19:54:42 rochecompaan
+# cgi_client.py
+# removed search link for the time being
+# moved rendering of matches to htmltemplate
+# hyperdb.py
+# filtering of nodes on full text search incorporated in filter method
+# roundupdb.py
+# added parameter to call of filter method
+# roundup_indexer.py
+# added search method to RoundupIndexer class
+#
# Revision 1.59 2002/03/12 22:52:26 richard
# more pychecker warnings removed
#
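Every caller of Class.filter now passes the search matches (or None) as the
new first argument; when a dictionary is given, nodes whose ids are not keys
in it are dropped before the sorting options are applied. A sketch of both
call forms, assuming an open hyperdb handle db:

    # no text search: callers pass None (see the roundupdb.py hunk below)
    nodeids = db.issue.filter(None, {'status': ['1', '2']}, [], [])

    # with a search, only ids present in the match dictionary survive
    matches = {'12': {'messages': ['23']}, '15': {'files': ['7']}}
    nodeids = db.issue.filter(matches, {'status': ['1', '2']}, [], [])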
diff --git a/roundup/indexer.py b/roundup/indexer.py
--- /dev/null
+++ b/roundup/indexer.py
@@ -0,0 +1,790 @@
+#!/usr/bin/env python
+
+"""Create full-text indexes and search them
+
+Notes:
+
+ See http://gnosis.cx/publish/programming/charming_python_15.txt
+ for a detailed discussion of this module.
+
+    This version requires Python 1.6+.  It turns out that using
+    string methods rather than [string] module functions in a
+    tight loop is enough faster to give a remarkable 25% speedup
+    in overall indexing.  However, only FOUR lines in
+    TextSplitter.text_splitter() were changed away from Python
+    1.5 compatibility.  Those lines are followed by comments
+    beginning with "# 1.52: " that show the old forms; Python
+    1.5 users can restore those lines and comment out the ones
+    just above them.
+
+Classes:
+
+ GenericIndexer -- Abstract class
+ TextSplitter -- Mixin class
+ Index
+ ShelveIndexer
+ FlatIndexer
+ XMLPickleIndexer
+ PickleIndexer
+ ZPickleIndexer
+ SlicedZPickleIndexer
+
+Functions:
+
+ echo_fname(fname)
+ recurse_files(...)
+
+Index Formats:
+
+ *Indexer.files: filename --> (fileid, wordcount)
+ *Indexer.fileids: fileid --> filename
+ *Indexer.words: word --> {fileid1:occurs, fileid2:occurs, ...}
+
+Module Usage:
+
+ There are a few ways to use this module. Just to utilize existing
+ functionality, something like the following is a likely
+ pattern:
+
+ import gnosis.indexer as indexer
+ index = indexer.MyFavoriteIndexer() # For some concrete Indexer
+ index.load_index('myIndex.db')
+ index.add_files(dir='/this/that/otherdir', pattern='*.txt')
+ hits = index.find(['spam','eggs','bacon'])
+ index.print_report(hits)
+
+ To customize the basic classes, something like the following is likely:
+
+ class MySplitter:
+ def splitter(self, text, ftype):
+ "Peform much better splitting than default (for filetypes)"
+ # ...
+ return words
+
+ class MyIndexer(indexer.GenericIndexer, MySplitter):
+ def load_index(self, INDEXDB=None):
+ "Retrieve three dictionaries from clever storage method"
+ # ...
+ self.words, self.files, self.fileids = WORDS, FILES, FILEIDS
+ def save_index(self, INDEXDB=None):
+ "Save three dictionaries to clever storage method"
+
+ index = MyIndexer()
+ # ...etc...
+
+Benchmarks:
+
+ As we know, there are lies, damn lies, and benchmarks. Take
+ the below with an adequate dose of salt. In version 0.10 of
+ the concrete indexers, some performance was tested. The
+ test case was a set of mail/news archives, that were about
+ 43 mB, and 225 files. In each case, an index was generated
+ (if possible), and a search for the words "xml python" was
+ performed.
+
+    - Index w/ PickleIndexer:     482s, 2.4 MB
+    - Search w/ PickleIndexer:    1.74s
+    - Index w/ ZPickleIndexer:    484s, 1.2 MB
+    - Search w/ ZPickleIndexer:   1.77s
+    - Index w/ FlatIndexer:       492s, 2.6 MB
+    - Search w/ FlatIndexer:      53s
+    - Index w/ ShelveIndexer:     (dumbdbm) Many minutes, tens of MB
+    - Search w/ ShelveIndexer:    Aborted before completely indexed
+    - Index w/ ShelveIndexer:     (dbhash) Long time (partial crash), 10 MB
+ - Search w/ ShelveIndexer: N/A. Too many glitches
+ - Index w/ XMLPickleIndexer: Memory error (xml_pickle uses bad string
+ composition for large output)
+ - Search w/ XMLPickleIndexer: N/A
+ - grep search (xml|python): 20s (cached: <5s)
+ - 'srch' utility (python): 12s
+"""
+#$Id: indexer.py,v 1.2 2002-05-25 07:16:24 rochecompaan Exp $
+
+__shell_usage__ = """
+Shell Usage: [python] indexer.py [options] [search_words]
+
+ -h, /h, -?, /?, ?, --help: Show this help screen
+ -index: Add files to index
+ -reindex: Refresh files already in the index
+ (can take much more time)
+ -casesensitive: Maintain the case of indexed words
+ (can lead to MUCH larger indices)
+ -norecurse, -local: Only index starting dir, not subdirs
+ -dir=<directory>: Starting directory for indexing
+ (default is current directory)
+ -indexdb=<database>: Use specified index database
+ (environ variable INDEXER_DB is preferred)
+ -regex=<pattern>: Index files matching regular expression
+ -glob=<pattern>: Index files matching glob pattern
+ -filter=<pattern> Only display results matching pattern
+ -output=<op>, -format=<opt>: How much detail on matches?
+ -<digit>: Quiet level (0=verbose ... 9=quiet)
+
+Output/format options are ALL/EVERYTHING/VERBOSE, RATINGS/SCORES,
+FILENAMES/NAMES/FILES, SUMMARY/REPORT"""
+
+__version__ = "$Revision: 1.2 $"
+__author__=["David Mertz (mertz@gnosis.cx)",]
+__thanks_to__=["Pat Knight (p.knight@ktgroup.co.uk)",
+ "Gregory Popovitch (greg@gpy.com)", ]
+__copyright__="""
+ This file is released to the public domain. I (dqm) would
+ appreciate it if you choose to keep derived works under terms
+ that promote freedom, but obviously am giving up any rights
+ to compel such.
+"""
+
+__history__="""
+ 0.1 Initial version.
+
+ 0.11 Tweaked TextSplitter after some random experimentation.
+
+ 0.12 Added SlicedZPickleIndexer (best choice, so far).
+
+ 0.13 Pat Knight pointed out need for binary open()'s of
+ certain files under Windows.
+
+ 0.14 Added '-filter' switch to search results.
+
+ 0.15 Added direct read of gzip files
+
+ 0.20 Gregory Popovitch did some profiling on TextSplitter,
+ and provided both huge speedups to the Python version
+ and hooks to a C extension class (ZopeTextSplitter).
+         A little refactoring by him and me (dqm) has nearly
+         doubled the speed of indexing.
+
+ 0.30 Module refactored into gnosis package. This is a
+ first pass, and various documentation and test cases
+ should be added later.
+"""
+import string, re, os, fnmatch, sys, copy, gzip
+from types import *
+
+#-- Silly "do nothing" default recursive file processor
+def echo_fname(fname): print fname
+
+#-- "Recurse and process files" utility function
+def recurse_files(curdir, pattern, exclusions, func=echo_fname, *args, **kw):
+ "Recursively process file pattern"
+ subdirs, files = [],[]
+ level = kw.get('level',0)
+
+ for name in os.listdir(curdir):
+ fname = os.path.join(curdir, name)
+ if name[-4:] in exclusions:
+            pass        # do not include binary file types
+ elif os.path.isdir(fname) and not os.path.islink(fname):
+ subdirs.append(fname)
+ # kludge to detect a regular expression across python versions
+ elif sys.version[0]=='1' and isinstance(pattern, re.RegexObject):
+ if pattern.match(name):
+ files.append(fname)
+ elif sys.version[0]=='2' and type(pattern)==type(re.compile('')):
+ if pattern.match(name):
+ files.append(fname)
+ elif type(pattern) is StringType:
+ if fnmatch.fnmatch(name, pattern):
+ files.append(fname)
+
+ for fname in files:
+ apply(func, (fname,)+args)
+ for subdir in subdirs:
+ recurse_files(subdir, pattern, exclusions, func, level=level+1)
+
+#-- Data bundle for index dictionaries
+class Index:
+ def __init__(self, words, files, fileids):
+ if words is not None: self.WORDS = words
+ if files is not None: self.FILES = files
+ if fileids is not None: self.FILEIDS = fileids
+
+#-- "Split plain text into words" utility function
+class TextSplitter:
+ def initSplitter(self):
+ prenum = string.join(map(chr, range(0,48)), '')
+ num2cap = string.join(map(chr, range(58,65)), '')
+ cap2low = string.join(map(chr, range(91,97)), '')
+ postlow = string.join(map(chr, range(123,256)), '')
+ nonword = prenum + num2cap + cap2low + postlow
+ self.word_only = string.maketrans(nonword, " "*len(nonword))
+ self.nondigits = string.join(map(chr, range(0,48)) + map(chr, range(58,255)), '')
+ self.alpha = string.join(map(chr, range(65,91)) + map(chr, range(97,123)), '')
+ self.ident = string.join(map(chr, range(256)), '')
+ self.init = 1
+
+ def splitter(self, text, ftype):
+ "Split the contents of a text string into a list of 'words'"
+ if ftype == 'text/plain':
+ words = self.text_splitter(text, self.casesensitive)
+ else:
+ raise NotImplementedError
+ return words
+
+ def text_splitter(self, text, casesensitive=0):
+ """Split text/plain string into a list of words
+
+ In version 0.20 this function is still fairly weak at
+ identifying "real" words, and excluding gibberish
+ strings. As long as the indexer looks at "real" text
+ files, it does pretty well; but if indexing of binary
+ data is attempted, a lot of gibberish gets indexed.
+ Suggestions on improving this are GREATLY APPRECIATED.
+ """
+ # Initialize some constants
+ if not hasattr(self,'init'): self.initSplitter()
+
+ # Speedup trick: attributes into local scope
+ word_only = self.word_only
+ ident = self.ident
+ alpha = self.alpha
+ nondigits = self.nondigits
+ translate = string.translate
+
+ # Let's adjust case if not case-sensitive
+ if not casesensitive: text = string.upper(text)
+
+ # Split the raw text
+ allwords = string.split(text)
+
+ # Finally, let's skip some words not worth indexing
+ words = []
+ for word in allwords:
+ if len(word) > 25: continue # too long (probably gibberish)
+
+ # Identify common patterns in non-word data (binary, UU/MIME, etc)
+ num_nonalpha = len(word.translate(ident, alpha))
+ numdigits = len(word.translate(ident, nondigits))
+ # 1.52: num_nonalpha = len(translate(word, ident, alpha))
+ # 1.52: numdigits = len(translate(word, ident, nondigits))
+ if numdigits > len(word)-2: # almost all digits
+ if numdigits > 5: # too many digits is gibberish
+ continue # a moderate number is year/zipcode/etc
+ elif num_nonalpha*3 > len(word): # too much scattered nonalpha = gibberish
+ continue
+
+ word = word.translate(word_only) # Let's strip funny byte values
+ # 1.52: word = translate(word, word_only)
+ subwords = word.split() # maybe embedded non-alphanumeric
+ # 1.52: subwords = string.split(word)
+ for subword in subwords: # ...so we might have subwords
+ if len(subword) <= 2: continue # too short a subword
+ words.append(subword)
+ return words
+
+class ZopeTextSplitter:
+ def initSplitter(self):
+ import Splitter
+ stop_words=(
+ 'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across',
+ 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone',
+ 'along', 'already', 'also', 'although', 'always', 'am', 'among',
+ 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any',
+ 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around',
+ 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes',
+ 'becoming', 'been', 'before', 'beforehand', 'behind', 'being',
+ 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both',
+ 'bottom', 'but', 'by', 'can', 'cannot', 'cant', 'con', 'could',
+ 'couldnt', 'cry', 'describe', 'detail', 'do', 'done', 'down', 'due',
+ 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else',
+ 'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone',
+ 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty',
+ 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly',
+ 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get',
+ 'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her',
+ 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers',
+ 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i',
+ 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it',
+ 'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least',
+ 'less', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill',
+ 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must',
+ 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless',
+ 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not',
+ 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once',
+ 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our',
+ 'ours', 'ourselves', 'out', 'over', 'own', 'per', 'perhaps',
+ 'please', 'pre', 'put', 'rather', 're', 'same', 'see', 'seem',
+ 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should',
+ 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some',
+ 'somehow', 'someone', 'something', 'sometime', 'sometimes',
+ 'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the',
+ 'their', 'them', 'themselves', 'then', 'thence', 'there',
+ 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these',
+ 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three',
+ 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too',
+ 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under',
+ 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well',
+ 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
+ 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
+ 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
+ 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without',
+ 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves',
+ )
+ self.stop_word_dict={}
+ for word in stop_words: self.stop_word_dict[word]=None
+ self.splitterobj = Splitter.getSplitter()
+ self.init = 1
+
+ def goodword(self, word):
+ return len(word) < 25
+
+ def splitter(self, text, ftype):
+ """never case-sensitive"""
+ if not hasattr(self,'init'): self.initSplitter()
+ return filter(self.goodword, self.splitterobj(text, self.stop_word_dict))
+
+
+#-- "Abstract" parent class for inherited indexers
+# (does not handle storage in parent, other methods are primitive)
+
+class GenericIndexer:
+ def __init__(self, **kw):
+ apply(self.configure, (), kw)
+
+ def whoami(self):
+ return self.__class__.__name__
+
+ def configure(self, REINDEX=0, CASESENSITIVE=0,
+ INDEXDB=os.environ.get('INDEXER_DB', 'TEMP_NDX.DB'),
+ ADD_PATTERN='*', QUIET=5):
+ "Configure settings used by indexing and storage/retrieval"
+ self.indexdb = INDEXDB
+ self.reindex = REINDEX
+ self.casesensitive = CASESENSITIVE
+ self.add_pattern = ADD_PATTERN
+ self.quiet = QUIET
+ self.filter = None
+
+ def add_files(self, dir=os.getcwd(), pattern=None, descend=1):
+ self.load_index()
+ exclusions = ('.zip','.pyc','.gif','.jpg','.dat','.dir')
+ if not pattern:
+ pattern = self.add_pattern
+ recurse_files(dir, pattern, exclusions, self.add_file)
+ # Rebuild the fileid index
+ self.fileids = {}
+ for fname in self.files.keys():
+ fileid = self.files[fname][0]
+ self.fileids[fileid] = fname
+
+ def add_file(self, fname, ftype='text/plain'):
+ "Index the contents of a regular file"
+ if self.files.has_key(fname): # Is file eligible for (re)indexing?
+ if self.reindex: # Reindexing enabled, cleanup dicts
+ self.purge_entry(fname, self.files, self.words)
+ else: # DO NOT reindex this file
+ if self.quiet < 5: print "Skipping", fname
+ return 0
+
+ # Read in the file (if possible)
+ try:
+ if fname[-3:] == '.gz':
+ text = gzip.open(fname).read()
+ else:
+ text = open(fname).read()
+ if self.quiet < 5: print "Indexing", fname
+ except IOError:
+ return 0
+ words = self.splitter(text, ftype)
+
+ # Find new file index, and assign it to filename
+        # (_TOP stores a negative counter, so it cannot collide with a real file index)
+ self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
+ file_index = abs(self.files['_TOP'][0])
+ self.files[fname] = (file_index, len(words))
+
+ filedict = {}
+ for word in words:
+ if filedict.has_key(word):
+ filedict[word] = filedict[word]+1
+ else:
+ filedict[word] = 1
+
+ for word in filedict.keys():
+ if self.words.has_key(word):
+ entry = self.words[word]
+ else:
+ entry = {}
+ entry[file_index] = filedict[word]
+ self.words[word] = entry
+
+ def add_othertext(self, identifier):
+ """Index a textual source other than a plain file
+
+ A child class might want to implement this method (or a similar one)
+ in order to index textual sources such as SQL tables, URLs, clay
+ tablets, or whatever else. The identifier should uniquely pick out
+ the source of the text (whatever it is)
+ """
+ raise NotImplementedError
+
+ def save_index(self, INDEXDB=None):
+ raise NotImplementedError
+
+ def load_index(self, INDEXDB=None, reload=0, wordlist=None):
+ raise NotImplementedError
+
+ def find(self, wordlist, print_report=0):
+ "Locate files that match ALL the words in wordlist"
+ self.load_index(wordlist=wordlist)
+ entries = {}
+ hits = copy.copy(self.fileids) # Copy of fileids index
+ for word in wordlist:
+ if not self.casesensitive:
+ word = string.upper(word)
+ entry = self.words.get(word) # For each word, get index
+ entries[word] = entry # of matching files
+ if not entry: # Nothing for this one word (fail)
+ return 0
+ for fileid in hits.keys(): # Eliminate hits for every non-match
+ if not entry.has_key(fileid):
+ del hits[fileid]
+ if print_report:
+ self.print_report(hits, wordlist, entries)
+ return hits
+
+ def print_report(self, hits={}, wordlist=[], entries={}):
+ # Figure out what to actually print (based on QUIET level)
+ output = []
+ for fileid,fname in hits.items():
+ message = fname
+ if self.quiet <= 3:
+ wordcount = self.files[fname][1]
+ matches = 0
+ countmess = '\n'+' '*13+`wordcount`+' words; '
+ for word in wordlist:
+ if not self.casesensitive:
+ word = string.upper(word)
+ occurs = entries[word][fileid]
+ matches = matches+occurs
+ countmess = countmess +`occurs`+' '+word+'; '
+ message = string.ljust('[RATING: '
+ +`1000*matches/wordcount`+']',13)+message
+ if self.quiet <= 2: message = message +countmess +'\n'
+ if self.filter: # Using an output filter
+ if fnmatch.fnmatch(message, self.filter):
+ output.append(message)
+ else:
+ output.append(message)
+
+ if self.quiet <= 5:
+ print string.join(output,'\n')
+ sys.stderr.write('\n'+`len(output)`+' files matched wordlist: '+
+ `wordlist`+'\n')
+ return output
+
+ def purge_entry(self, fname, file_dct, word_dct):
+ "Remove a file from file index and word index"
+        try:                    # The easy part, cleanup the file index
+            file_index = file_dct[fname][0]  # files maps fname -> (fileid, wordcount)
+            del file_dct[fname]
+        except KeyError:
+            return              # file not in the index; nothing to purge
+ # The much harder part, cleanup the word index
+ for word, occurs in word_dct.items():
+ if occurs.has_key(file_index):
+ del occurs[file_index]
+ word_dct[word] = occurs
+
+ def index_loaded(self):
+ return ( hasattr(self,'fileids') and
+ hasattr(self,'files') and
+ hasattr(self,'words') )
+
+#-- Provide an actual storage facility for the indexes (i.e. shelve)
+class ShelveIndexer(GenericIndexer, TextSplitter):
+ """Concrete Indexer utilizing [shelve] for storage
+
+ Unfortunately, [shelve] proves far too slow in indexing, while
+    creating monstrously large indexes.  Not recommended, at least
+    under the default dbm modules tested.  Also, the class may be broken because
+ shelves do not, apparently, support the .values() and .items()
+ methods. Fixing this is a low priority, but the sample code is
+ left here.
+ """
+ def load_index(self, INDEXDB=None, reload=0, wordlist=None):
+ INDEXDB = INDEXDB or self.indexdb
+ import shelve
+ self.words = shelve.open(INDEXDB+".WORDS")
+ self.files = shelve.open(INDEXDB+".FILES")
+ self.fileids = shelve.open(INDEXDB+".FILEIDS")
+        if not self.files:              # New index
+ self.files['_TOP'] = (0,None)
+
+ def save_index(self, INDEXDB=None):
+ INDEXDB = INDEXDB or self.indexdb
+ pass
+
+class FlatIndexer(GenericIndexer, TextSplitter):
+ """Concrete Indexer utilizing flat-file for storage
+
+ See the comments in the referenced article for details; in
+ brief, this indexer has about the same timing as the best in
+ -creating- indexes and the storage requirements are
+ reasonable. However, actually -using- a flat-file index is
+ more than an order of magnitude worse than the best indexer
+ (ZPickleIndexer wins overall).
+
+ On the other hand, FlatIndexer creates a wonderfully easy to
+ parse database format if you have a reason to transport the
+ index to a different platform or programming language. And
+ should you perform indexing as part of a long-running
+ process, the overhead of initial file parsing becomes
+ irrelevant.
+ """
+ def load_index(self, INDEXDB=None, reload=0, wordlist=None):
+ # Unless reload is indicated, do not load twice
+ if self.index_loaded() and not reload: return 0
+ # Ok, now let's actually load it
+ INDEXDB = INDEXDB or self.indexdb
+ self.words = {}
+ self.files = {'_TOP':(0,None)}
+ self.fileids = {}
+ try: # Read index contents
+ for line in open(INDEXDB).readlines():
+ fields = string.split(line)
+ if fields[0] == '-': # Read a file/fileid line
+ fileid = eval(fields[2])
+ wordcount = eval(fields[3])
+ fname = fields[1]
+ self.files[fname] = (fileid, wordcount)
+ self.fileids[fileid] = fname
+ else: # Read a word entry (dict of hits)
+ entries = {}
+ word = fields[0]
+ for n in range(1,len(fields),2):
+ fileid = eval(fields[n])
+ occurs = eval(fields[n+1])
+ entries[fileid] = occurs
+ self.words[word] = entries
+ except:
+ pass # New index
+
+ def save_index(self, INDEXDB=None):
+ INDEXDB = INDEXDB or self.indexdb
+ tab, lf, sp = '\t','\n',' '
+ indexdb = open(INDEXDB,'w')
+ for fname,entry in self.files.items():
+ indexdb.write('- '+fname +tab +`entry[0]` +tab +`entry[1]` +lf)
+ for word,entry in self.words.items():
+ indexdb.write(word +tab+tab)
+ for fileid,occurs in entry.items():
+ indexdb.write(`fileid` +sp +`occurs` +sp)
+ indexdb.write(lf)
+
+class PickleIndexer(GenericIndexer, TextSplitter):
+ def load_index(self, INDEXDB=None, reload=0, wordlist=None):
+ # Unless reload is indicated, do not load twice
+ if self.index_loaded() and not reload: return 0
+ # Ok, now let's actually load it
+ import cPickle
+ INDEXDB = INDEXDB or self.indexdb
+ try:
+ pickle_str = open(INDEXDB,'rb').read()
+ db = cPickle.loads(pickle_str)
+ except: # New index
+ db = Index({}, {'_TOP':(0,None)}, {})
+ self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS
+
+ def save_index(self, INDEXDB=None):
+ import cPickle
+ INDEXDB = INDEXDB or self.indexdb
+ db = Index(self.words, self.files, self.fileids)
+ open(INDEXDB,'wb').write(cPickle.dumps(db, 1))
+
+class XMLPickleIndexer(PickleIndexer):
+ """Concrete Indexer utilizing XML for storage
+
+ While this is, as expected, a verbose format, the possibility
+ of using XML as a transport format for indexes might be
+ useful. However, [xml_pickle] is in need of some redesign to
+ avoid gross inefficiency when creating very large
+ (multi-megabyte) output files (fixed in [xml_pickle] version
+ 0.48 or above)
+ """
+ def load_index(self, INDEXDB=None, reload=0, wordlist=None):
+ # Unless reload is indicated, do not load twice
+ if self.index_loaded() and not reload: return 0
+ # Ok, now let's actually load it
+ from gnosis.xml.pickle import XML_Pickler
+ INDEXDB = INDEXDB or self.indexdb
+ try: # XML file exists
+ xml_str = open(INDEXDB).read()
+ db = XML_Pickler().loads(xml_str)
+ except: # New index
+ db = Index({}, {'_TOP':(0,None)}, {})
+ self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS
+
+ def save_index(self, INDEXDB=None):
+ from gnosis.xml.pickle import XML_Pickler
+ INDEXDB = INDEXDB or self.indexdb
+ db = Index(self.words, self.files, self.fileids)
+ open(INDEXDB,'w').write(XML_Pickler(db).dumps())
+
+class ZPickleIndexer(PickleIndexer):
+ def load_index(self, INDEXDB=None, reload=0, wordlist=None):
+ # Unless reload is indicated, do not load twice
+ if self.index_loaded() and not reload: return 0
+ # Ok, now let's actually load it
+ import cPickle, zlib
+ INDEXDB = INDEXDB or self.indexdb
+ try:
+ pickle_str = zlib.decompress(open(INDEXDB+'!','rb').read())
+ db = cPickle.loads(pickle_str)
+ except: # New index
+ db = Index({}, {'_TOP':(0,None)}, {})
+ self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS
+
+ def save_index(self, INDEXDB=None):
+ import cPickle, zlib
+ INDEXDB = INDEXDB or self.indexdb
+ db = Index(self.words, self.files, self.fileids)
+ pickle_fh = open(INDEXDB+'!','wb')
+ pickle_fh.write(zlib.compress(cPickle.dumps(db, 1)))
+
+
+class SlicedZPickleIndexer(ZPickleIndexer):
+ segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!"
+ def load_index(self, INDEXDB=None, reload=0, wordlist=None):
+ # Unless reload is indicated, do not load twice
+ if self.index_loaded() and not reload: return 0
+ # Ok, now let's actually load it
+ import cPickle, zlib
+ INDEXDB = INDEXDB or self.indexdb
+ db = Index({}, {'_TOP':(0,None)}, {})
+ # Identify the relevant word-dictionary segments
+ if not wordlist:
+ segments = self.segments
+ else:
+ segments = ['-','#']
+ for word in wordlist:
+ segments.append(string.upper(word[0]))
+ # Load the segments
+ for segment in segments:
+ try:
+ pickle_str = zlib.decompress(open(INDEXDB+segment,'rb').read())
+ dbslice = cPickle.loads(pickle_str)
+ if dbslice.__dict__.get('WORDS'): # If it has some words, add them
+ for word,entry in dbslice.WORDS.items():
+ db.WORDS[word] = entry
+ if dbslice.__dict__.get('FILES'): # If it has some files, add them
+ db.FILES = dbslice.FILES
+ if dbslice.__dict__.get('FILEIDS'): # If it has fileids, add them
+ db.FILEIDS = dbslice.FILEIDS
+ except:
+ pass # No biggie, couldn't find this segment
+ self.words, self.files, self.fileids = db.WORDS, db.FILES, db.FILEIDS
+
+ def julienne(self, INDEXDB=None):
+ import cPickle, zlib
+ INDEXDB = INDEXDB or self.indexdb
+ segments = self.segments # all the (little) indexes
+ for segment in segments:
+ try: # brutal space saver... delete all the small segments
+ os.remove(INDEXDB+segment)
+ except OSError:
+ pass # probably just nonexistent segment index file
+ # First write the much simpler filename/fileid dictionaries
+ dbfil = Index(None, self.files, self.fileids)
+ open(INDEXDB+'-','wb').write(zlib.compress(cPickle.dumps(dbfil,1)))
+ # The hard part is splitting the word dictionary up, of course
+ letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ segdicts = {} # Need batch of empty dicts
+ for segment in letters+'#':
+ segdicts[segment] = {}
+ for word, entry in self.words.items(): # Split into segment dicts
+ initchar = string.upper(word[0])
+ if initchar in letters:
+ segdicts[initchar][word] = entry
+ else:
+ segdicts['#'][word] = entry
+ for initchar in letters+'#':
+ db = Index(segdicts[initchar], None, None)
+ pickle_str = cPickle.dumps(db, 1)
+ filename = INDEXDB+initchar
+ pickle_fh = open(filename,'wb')
+ pickle_fh.write(zlib.compress(pickle_str))
+ os.chmod(filename,0664)
+
+ save_index = julienne
+
+PreferredIndexer = SlicedZPickleIndexer
+
+#-- If called from command-line, parse arguments and take actions
+if __name__ == '__main__':
+ import time
+ start = time.time()
+ search_words = [] # Word search list (if specified)
+ opts = 0 # Any options specified?
+ if len(sys.argv) < 2:
+ pass # No options given
+ else:
+ upper = string.upper
+ dir = os.getcwd() # Default to indexing from current directory
+ descend = 1 # Default to recursive indexing
+ ndx = PreferredIndexer()
+ for opt in sys.argv[1:]:
+ if opt in ('-h','/h','-?','/?','?','--help'): # help screen
+ print __shell_usage__
+ opts = -1
+ break
+ elif opt[0] in '/-': # a switch!
+ opts = opts+1
+ if upper(opt[1:]) == 'INDEX': # Index files
+ ndx.quiet = 0
+ pass # Use defaults if no other options
+ elif upper(opt[1:]) == 'REINDEX': # Reindex
+ ndx.reindex = 1
+ elif upper(opt[1:]) == 'CASESENSITIVE': # Case sensitive
+ ndx.casesensitive = 1
+ elif upper(opt[1:]) in ('NORECURSE','LOCAL'): # No recursion
+ descend = 0
+ elif upper(opt[1:4]) == 'DIR': # Dir to index
+ dir = opt[5:]
+ elif upper(opt[1:8]) == 'INDEXDB': # Index specified
+ ndx.indexdb = opt[9:]
+ sys.stderr.write(
+ "Use of INDEXER_DB environment variable is STRONGLY recommended.\n")
+ elif upper(opt[1:6]) == 'REGEX': # RegEx files to index
+ ndx.add_pattern = re.compile(opt[7:])
+ elif upper(opt[1:5]) == 'GLOB': # Glob files to index
+ ndx.add_pattern = opt[6:]
+ elif upper(opt[1:7]) in ('OUTPUT','FORMAT'): # How should results look?
+ opts = opts-1 # this is not an option for indexing purposes
+ level = upper(opt[8:])
+ if level in ('ALL','EVERYTHING','VERBOSE', 'MAX'):
+ ndx.quiet = 0
+ elif level in ('RATINGS','SCORES','HIGH'):
+ ndx.quiet = 3
+ elif level in ('FILENAMES','NAMES','FILES','MID'):
+ ndx.quiet = 5
+ elif level in ('SUMMARY','MIN'):
+ ndx.quiet = 9
+ elif upper(opt[1:7]) == 'FILTER': # Regex filter output
+ opts = opts-1 # this is not an option for indexing purposes
+ ndx.filter = opt[8:]
+ elif opt[1:] in string.digits:
+ opts = opts-1
+ ndx.quiet = eval(opt[1])
+ else:
+ search_words.append(opt) # Search words
+
+ if opts > 0:
+ ndx.add_files(dir=dir)
+ ndx.save_index()
+ if search_words:
+ ndx.find(search_words, print_report=1)
+ if not opts and not search_words:
+ sys.stderr.write("Perhaps you would like to use the --help option?\n")
+ else:
+ sys.stderr.write('Processed in %.3f seconds (%s)'
+ % (time.time()-start, ndx.whoami()))
+
+#
+#$Log: not supported by cvs2svn $
+#Revision 1.1.2.3 2002/04/03 12:05:15 rochecompaan
+#Removed dos control characters.
+#
+#Revision 1.1.2.2 2002/04/03 12:01:55 rochecompaan
+#Oops. Forgot to include cvs keywords in file.
+#
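SlicedZPickleIndexer splits the word dictionary into one compressed pickle per
initial letter (plus '#' for non-letters and '-' for the file maps), so a
search only loads the segments for the query words' first letters. A minimal
usage sketch with hypothetical paths:

    from roundup.indexer import SlicedZPickleIndexer

    ndx = SlicedZPickleIndexer(INDEXDB='/tmp/index.db', QUIET=9)
    ndx.add_files(dir='/tmp/docs', pattern='*.txt')
    ndx.save_index()                    # writes index.db- plus per-letter slices
    hits = ndx.find(['spam', 'eggs'])   # loads only the '-', '#', S and E slices
    # hits maps fileid -> filename for files containing ALL the words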
diff --git a/roundup/roundup_indexer.py b/roundup/roundup_indexer.py
--- /dev/null
+++ b/roundup/roundup_indexer.py
@@ -0,0 +1,98 @@
+#
+# Copyright (c) 2001 Bizar Software Pty Ltd (http://www.bizarsoftware.com.au/)
+# This module is free software, and you may redistribute it and/or modify
+# under the same terms as Python, so long as this copyright message and
+# disclaimer are retained in their original form.
+#
+# IN NO EVENT SHALL BIZAR SOFTWARE PTY LTD BE LIABLE TO ANY PARTY FOR
+# DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING
+# OUT OF THE USE OF THIS CODE, EVEN IF THE AUTHOR HAS BEEN ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+#
+# BIZAR SOFTWARE PTY LTD SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
+# BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE. THE CODE PROVIDED HEREUNDER IS ON AN "AS IS"
+# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
+# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+#
+#$Id: roundup_indexer.py,v 1.2 2002-05-25 07:16:24 rochecompaan Exp $
+'''
+This module provides an indexer class, RoundupIndexer, that stores text
+indices in a roundup instance. This class makes searching the content of
+messages and text files possible.
+'''
+import os
+from roundup.indexer import SlicedZPickleIndexer
+
+class RoundupIndexer(SlicedZPickleIndexer):
+ ''' Indexes messages and files
+ '''
+
+ def __init__(self, db_path):
+ indexdb_path = os.path.join(db_path, 'indexes')
+ index_exists = 0
+ if not os.path.exists(indexdb_path):
+ os.makedirs(indexdb_path)
+ os.chmod(indexdb_path, 0775)
+ else:
+ index_exists = 1
+ index_path = os.path.join(indexdb_path, 'index.db')
+ SlicedZPickleIndexer.__init__(self,
+ INDEXDB=index_path, QUIET=9)
+ files_path = os.path.join(db_path, 'files')
+ if not index_exists:
+ self.add_files(dir=files_path)
+ self.save_index()
+
+ def search(self, search_terms, klass):
+        ''' Return a dictionary mapping matching nodes of klass to the
+            message and file ids that produced the hits
+        '''
+        hits = self.find(search_terms)
+        nodeids = {}
+        designator_propname = {'msg': 'messages',
+                               'file': 'files'}
+        if hits:
+ # build a dictionary of nodes and their associated messages
+ # and files
+ for hit in hits.keys():
+ filename = hits[hit].split('/')[-1]
+ for designator, propname in designator_propname.items():
+ if filename.find(designator) == -1: continue
+ nodeid = filename[len(designator):]
+ result = apply(klass.find, (), {propname:nodeid})
+ if not result: continue
+
+ id = str(result[0])
+ if not nodeids.has_key(id):
+ nodeids[id] = {}
+
+ node_dict = nodeids[id]
+ if not node_dict.has_key(propname):
+ node_dict[propname] = [nodeid]
+                else:
+ node_dict[propname].append(nodeid)
+
+ return nodeids
+
+
+#
+#$Log: not supported by cvs2svn $
+#Revision 1.1.2.3 2002/05/02 11:52:12 rochecompaan
+#Fixed small bug that prevented indexes from being generated.
+#
+#Revision 1.1.2.2 2002/04/19 19:54:42 rochecompaan
+#cgi_client.py
+# removed search link for the time being
+# moved rendering of matches to htmltemplate
+#hyperdb.py
+# filtering of nodes on full text search incorporated in filter method
+#roundupdb.py
+# added parameter to call of filter method
+#roundup_indexer.py
+# added search method to RoundupIndexer class
+#
+#Revision 1.1.2.1 2002/04/03 11:55:57 rochecompaan
+# . Added feature #526730 - search for messages capability
+#
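RoundupIndexer.search maps raw index hits (filenames such as msg23 or file7)
back onto nodes of the class being listed, via klass.find(messages=...) or
klass.find(files=...). A sketch of a call, assuming an open tracker database
db (terms and ids hypothetical):

    indexer = RoundupIndexer('/tracker/db')
    matches = indexer.search(['spam', 'eggs'], db.issue)
    # e.g. {'12': {'messages': ['23']}, '15': {'files': ['7']}}
    # keys are issue ids; the lists hold the matching msg/file ids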
diff --git a/roundup/roundupdb.py b/roundup/roundupdb.py
index f870bb99a0e4b75e60accffc5d66b1dfdebaf2f5..f874db8cbc6336b89e84608d66201f10de4eafe6 100644 (file)
--- a/roundup/roundupdb.py
+++ b/roundup/roundupdb.py
# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
#
-# $Id: roundupdb.py,v 1.52 2002-05-15 03:27:16 richard Exp $
+# $Id: roundupdb.py,v 1.53 2002-05-25 07:16:24 rochecompaan Exp $
__doc__ = """
Extending hyperdb with types specific to issue-tracking.
# try the user alternate addresses if possible
props = self.user.getprops()
if props.has_key('alternate_addresses'):
- users = self.user.filter({'alternate_addresses': address},
+ users = self.user.filter(None, {'alternate_addresses': address},
[], [])
user = extractUserFromList(self.user, users)
if user is not None: return user
#
# $Log: not supported by cvs2svn $
+# Revision 1.52 2002/05/15 03:27:16 richard
+# . fixed SCRIPT_NAME in ZRoundup for instances not at top level of Zope
+# (thanks dman)
+# . fixed some sorting issues that were breaking some unit tests under py2.2
+# . mailgw test output dir was confusing the init test (but only on 2.2 *shrug*)
+#
+# fixed bug in the init unit test that meant only the bsddb test ran if it
+# could (it clobbered the anydbm test)
+#
# Revision 1.51 2002/04/08 03:46:42 richard
# make it work
#
# The initial detector is one that we'll be using here at ekit - it bounces new
# issue messages to a team address.
#
+# Revision 1.49.2.1 2002/04/19 19:54:42 rochecompaan
+# cgi_client.py
+# removed search link for the time being
+# moved rendering of matches to htmltemplate
+# hyperdb.py
+# filtering of nodes on full text search incorporated in filter method
+# roundupdb.py
+# added parameter to call of filter method
+# roundup_indexer.py
+# added search method to RoundupIndexer class
+#
# Revision 1.49 2002/03/19 06:41:49 richard
# Faster, easier, less mess ;)
#
diff --git a/roundup/templates/classic/instance_config.py b/roundup/templates/classic/instance_config.py
index 0a599aec6e471d32b8e5f0044e8b4b8e281c358f..cd06f426ff99d3268d29a074304bd17d5a790442 100644 (file)
--- a/roundup/templates/classic/instance_config.py
+++ b/roundup/templates/classic/instance_config.py
# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
#
-# $Id: instance_config.py,v 1.17 2002-05-22 00:32:33 richard Exp $
+# $Id: instance_config.py,v 1.18 2002-05-25 07:16:25 rochecompaan Exp $
MAIL_DOMAIN=MAILHOST=HTTP_HOST=None
HTTP_PORT=0
# list the classes that users are able to add nodes to
HEADER_ADD_LINKS = ['issue']
+# list the classes that users can search
+HEADER_SEARCH_LINKS = ['issue']
+
+# list search filters per class
+SEARCH_FILTERS = ['ISSUE_FILTER', 'SUPPORT_FILTER']
+
# Now the DEFAULT display specification. TODO: describe format
DEFAULT_INDEX = {
'LABEL': 'All Issues',
},
}
+ISSUE_FILTER = {
+ 'CLASS': 'issue',
+ 'FILTER': ['status', 'priority', 'assignedto', 'creator']
+}
+
+SUPPORT_FILTER = {
+ 'CLASS': 'issue',
+ 'FILTER': ['status', 'priority', 'assignedto', 'creator']
+}
+
#
# $Log: not supported by cvs2svn $
+# Revision 1.17 2002/05/22 00:32:33 richard
+# . changed the default message list in issues to display the message body
+# . made backends.__init__ be more specific about which ImportErrors it really
+# wants to ignore
+# . fixed the example addresses in the templates to use correct example domains
+# . cleaned out the template stylesheets, removing a bunch of junk that really
+# wasn't necessary (font specs, styles never used) and added a style for
+# message content
+#
# Revision 1.16 2002/05/21 06:05:54 richard
# . #551483 ] assignedto in Client.make_index_link
#
# the config variables EMAIL_KEEP_QUOTED_TEST and
# EMAIL_LEAVE_BODY_UNCHANGED.
#
+# Revision 1.13.2.2 2002/05/02 11:49:19 rochecompaan
+# Allow customization of the search filters that should be displayed
+# on the search page.
+#
+# Revision 1.13.2.1 2002/04/20 13:23:33 rochecompaan
+# We now have a separate search page for nodes. Search links for
+# different classes can be customized in instance_config similar to
+# index links.
+#
# Revision 1.13 2002/03/14 23:59:24 richard
# . #517734 ] web header customisation is obscure
#
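These specs are read by Client.searchnode (see the cgi_client.py hunk above):
on the first, form-less request of a search page, the filter list for the
matching class is taken from the spec. A sketch of the lookup, where instance
stands for the loaded instance_config module:

    for name in instance.SEARCH_FILTERS:    # ['ISSUE_FILTER', 'SUPPORT_FILTER']
        spec = getattr(instance, name)      # e.g. the ISSUE_FILTER dict above
        if spec['CLASS'] == 'issue':
            filter = spec['FILTER']         # ['status', 'priority', 'assignedto', 'creator']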
diff --git a/roundup/templates/extended/instance_config.py b/roundup/templates/extended/instance_config.py
index b10c039a17b4c6609befc9c0a9b7aff67fb91363..ec6a3bf8f2fcaa7cc403849e9098782ae99b533a 100644 (file)
--- a/roundup/templates/extended/instance_config.py
+++ b/roundup/templates/extended/instance_config.py
# BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
# SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
#
-# $Id: instance_config.py,v 1.17 2002-05-22 00:32:34 richard Exp $
+# $Id: instance_config.py,v 1.18 2002-05-25 07:16:25 rochecompaan Exp $
MAIL_DOMAIN=MAILHOST=HTTP_HOST=None
HTTP_PORT=0
# list the classes that users are able to add nodes to
HEADER_ADD_LINKS = ['issue', 'support']
+# list the classes that users can search
+HEADER_SEARCH_LINKS = ['issue', 'support']
+
+SEARCH_FILTERS = ['ISSUE_FILTER', 'SUPPORT_FILTER']
+
# Now the DEFAULT display specifications. TODO: describe format
DEFAULT_INDEX = {
'LABEL': 'All Issues',
},
}
+ISSUE_FILTER = {
+ 'CLASS': 'issue',
+ 'FILTER': ['status', 'priority', 'assignedto', 'creator']
+}
+
+SUPPORT_FILTER = {
+ 'CLASS': 'issue',
+ 'FILTER': ['status', 'priority', 'assignedto', 'creator']
+}
+
#
# $Log: not supported by cvs2svn $
+# Revision 1.17 2002/05/22 00:32:34 richard
+# . changed the default message list in issues to display the message body
+# . made backends.__init__ be more specific about which ImportErrors it really
+# wants to ignore
+# . fixed the example addresses in the templates to use correct example domains
+# . cleaned out the template stylesheets, removing a bunch of junk that really
+# wasn't necessary (font specs, styles never used) and added a style for
+# message content
+#
# Revision 1.16 2002/05/21 06:05:54 richard
# . #551483 ] assignedto in Client.make_index_link
#
# the config variables EMAIL_KEEP_QUOTED_TEST and
# EMAIL_LEAVE_BODY_UNCHANGED.
#
+# Revision 1.13.2.2 2002/05/02 11:49:19 rochecompaan
+# Allow customization of the search filters that should be displayed
+# on the search page.
+#
+# Revision 1.13.2.1 2002/04/20 13:23:34 rochecompaan
+# We now have a separate search page for nodes. Search links for
+# different classes can be customized in instance_config similar to
+# index links.
+#
# Revision 1.13 2002/03/14 23:59:24 richard
# . #517734 ] web header customisation is obscure
#
diff --git a/run_tests b/run_tests
index 032b7c96cfaeaf617d0189e7f46be88a580d127a..54eb006681aa3c4556e123ba667faf792bdb5638 100755 (executable)
--- a/run_tests
+++ b/run_tests
-#! /usr/bin/env python
+#! /usr/bin/env python2.2
#
# Copyright (c) 2001 Richard Jones
# This module is free software, and you may redistribute it and/or modify
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
-# $Id: run_tests,v 1.4 2002-02-14 23:38:12 richard Exp $
+# $Id: run_tests,v 1.5 2002-05-25 07:16:23 rochecompaan Exp $
from test import go
import sys
#
# $Log: not supported by cvs2svn $
+# Revision 1.4 2002/02/14 23:38:12 richard
+# Fixed the unit tests for the mailgw re: the x-roundup-name header.
+# Also made the test runner more user-friendly:
+# ./run_tests - detect all tests in test/test_<name>.py and run them
+# ./run_tests <name> - run only test/test_<name>.py
+# eg ./run_tests mailgw - run the mailgw test from test/test_mailgw.py
+#
# Revision 1.3 2002/01/23 20:09:41 jhermann
# Proper fix for failing test
#