From 3ee3a04f9c1a4d3a911e633347c72d61be685019 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 9 Jul 2002 04:19:09 +0000 Subject: [PATCH] Added reindex command to roundup-admin. Fixed reindex on first access. Also fixed reindexing of entries that change. git-svn-id: http://svn.roundup-tracker.org/svnroot/roundup/trunk@840 57a73879-2fb5-44c3-a270-3262357dd7e2 --- roundup/admin.py | 16 ++++++- roundup/backends/back_anydbm.py | 24 ++++++++-- roundup/cgi_client.py | 11 +++-- roundup/htmltemplate.py | 8 +++- roundup/indexer.py | 80 ++++++++++++++++++++------------- roundup/roundupdb.py | 20 ++++++++- test/test_db.py | 43 ++++++++++++++++-- 7 files changed, 153 insertions(+), 49 deletions(-) diff --git a/roundup/admin.py b/roundup/admin.py index 481ca46..e54ef7c 100644 --- a/roundup/admin.py +++ b/roundup/admin.py @@ -16,7 +16,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -# $Id: admin.py,v 1.15 2002-06-17 23:14:44 richard Exp $ +# $Id: admin.py,v 1.16 2002-07-09 04:19:09 richard Exp $ import sys, os, getpass, getopt, re, UserDict, shlex, shutil try: @@ -964,6 +964,17 @@ Date format is "YYYY-MM-DD" eg: self.db.pack(pack_before) return 0 + def do_reindex(self, args): + '''Usage: reindex + Re-generate an instance's search indexes. + + This will re-generate the search indexes for an instance. This will + typically happen automatically. + ''' + self.db.indexer.force_reindex() + self.db.reindex() + return 0 + def run_command(self, args): '''Run a single command ''' @@ -1114,6 +1125,9 @@ if __name__ == '__main__': # # $Log: not supported by cvs2svn $ +# Revision 1.15 2002/06/17 23:14:44 richard +# . #569415 ] {version} +# # Revision 1.14 2002/06/11 06:41:50 richard # Removed prompt for admin email in initialisation. # diff --git a/roundup/backends/back_anydbm.py b/roundup/backends/back_anydbm.py index d2473db..b341c44 100644 --- a/roundup/backends/back_anydbm.py +++ b/roundup/backends/back_anydbm.py @@ -15,7 +15,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -#$Id: back_anydbm.py,v 1.39 2002-07-09 03:02:52 richard Exp $ +#$Id: back_anydbm.py,v 1.40 2002-07-09 04:19:09 richard Exp $ ''' This module defines a backend that saves the hyperdatabase in a database chosen by anydbm. It is guaranteed to always be available in python @@ -69,8 +69,10 @@ class Database(FileStorage, hyperdb.Database): def post_init(self): """Called once the schema initialisation has finished.""" # reindex the db if necessary - if not self.indexer.should_reindex(): - return + if self.indexer.should_reindex(): + self.reindex() + + def reindex(self): for klass in self.classes.values(): for nodeid in klass.list(): klass.index(nodeid) @@ -507,6 +509,22 @@ class Database(FileStorage, hyperdb.Database): # #$Log: not supported by cvs2svn $ +#Revision 1.39 2002/07/09 03:02:52 richard +#More indexer work: +#- all String properties may now be indexed too. Currently there's a bit of +# "issue" specific code in the actual searching which needs to be +# addressed. In a nutshell: +# + pass 'indexme="yes"' as a String() property initialisation arg, eg: +# file = FileClass(db, "file", name=String(), type=String(), +# comment=String(indexme="yes")) +# + the comment will then be indexed and be searchable, with the results +# related back to the issue that the file is linked to +#- as a result of this work, the FileClass has a default MIME type that may +# be overridden in a subclass, or by the use of a "type" property as is +# done in the default templates. +#- the regeneration of the indexes (if necessary) is done once the schema is +# set up in the dbinit. +# #Revision 1.38 2002/07/08 06:58:15 richard #cleaned up the indexer code: # - it splits more words out (much simpler, faster splitter) diff --git a/roundup/cgi_client.py b/roundup/cgi_client.py index 83380f6..b471c1b 100644 --- a/roundup/cgi_client.py +++ b/roundup/cgi_client.py @@ -15,7 +15,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -# $Id: cgi_client.py,v 1.133 2002-07-08 15:32:05 gmcm Exp $ +# $Id: cgi_client.py,v 1.134 2002-07-09 04:19:09 richard Exp $ __doc__ = """ WWW request handler (also used in the stand-alone server). @@ -26,7 +26,6 @@ import binascii, Cookie, time, random import roundupdb, htmltemplate, date, hyperdb, password from roundup.i18n import _ -from roundup.indexer import Indexer class Unauthorised(ValueError): pass @@ -73,10 +72,6 @@ class Client: # someone gave us a non-int debug level, turn it off self.debug = 0 - # used for searching the indexes - self.indexer = Indexer('%s/db'%instance.INSTANCE_HOME) - - def getuid(self): try: return self.db.user.lookup(self.user) @@ -1459,6 +1454,10 @@ def parsePropsFromForm(db, cl, form, nodeid=0, num_re=re.compile('^\d+$')): # # $Log: not supported by cvs2svn $ +# Revision 1.133 2002/07/08 15:32:05 gmcm +# Pagination of index pages. +# New search form. +# # Revision 1.132 2002/07/08 07:26:14 richard # ehem # diff --git a/roundup/htmltemplate.py b/roundup/htmltemplate.py index 2541ee3..65e9992 100644 --- a/roundup/htmltemplate.py +++ b/roundup/htmltemplate.py @@ -15,7 +15,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -# $Id: htmltemplate.py,v 1.95 2002-07-08 15:32:06 gmcm Exp $ +# $Id: htmltemplate.py,v 1.96 2002-07-09 04:19:09 richard Exp $ __doc__ = """ Template engine. @@ -874,7 +874,7 @@ class IndexTemplate(TemplateFunctions): matches = None if nodeids is None: if search_text != '': - matches = self.client.indexer.search( + matches = self.db.indexer.search( search_text.split(' '), self.cl) nodeids = self.cl.filter(matches, filterspec, sort, group) for nodeid in nodeids[startwith:startwith+pagesize]: @@ -1237,6 +1237,10 @@ class NewItemTemplate(TemplateFunctions): # # $Log: not supported by cvs2svn $ +# Revision 1.95 2002/07/08 15:32:06 gmcm +# Pagination of index pages. +# New search form. +# # Revision 1.94 2002/06/27 15:38:53 gmcm # Fix the cycles (a clear method, called after render, that removes # the bound methods from the globals dict). diff --git a/roundup/indexer.py b/roundup/indexer.py index d82560c..096b6c6 100644 --- a/roundup/indexer.py +++ b/roundup/indexer.py @@ -14,7 +14,7 @@ # that promote freedom, but obviously am giving up any rights # to compel such. # -#$Id: indexer.py,v 1.4 2002-07-09 03:02:52 richard Exp $ +#$Id: indexer.py,v 1.5 2002-07-09 04:19:09 richard Exp $ ''' This module provides an indexer class, RoundupIndexer, that stores text indices in a roundup instance. This class makes searching the content of @@ -25,29 +25,35 @@ import os, shutil, re, mimetypes, marshal, zlib, errno class Indexer: ''' Indexes information from roundup's hyperdb to allow efficient searching. + + Three structures are created by the indexer: + files {identifier: (fileid, wordcount)} + words {word: {fileid: count}} + fileids {fileid: identifier} ''' def __init__(self, db_path): - indexdb_path = os.path.join(db_path, 'indexes') - self.indexdb = os.path.join(indexdb_path, 'index.db') + self.indexdb_path = os.path.join(db_path, 'indexes') + self.indexdb = os.path.join(self.indexdb_path, 'index.db') self.reindex = 0 self.casesensitive = 0 self.quiet = 9 # see if we need to reindex because of a change in code - if (not os.path.exists(indexdb_path) or - not os.path.exists(os.path.join(indexdb_path, 'version'))): + if (not os.path.exists(self.indexdb_path) or + not os.path.exists(os.path.join(self.indexdb_path, 'version'))): # TODO: if the version file exists (in the future) we'll want to # check the value in it - for now the file itself is a flag - if os.path.exists(indexdb_path): - shutil.rmtree(indexdb_path) - os.makedirs(indexdb_path) - os.chmod(indexdb_path, 0775) - open(os.path.join(indexdb_path, 'version'), 'w').write('1\n') - - # we need to reindex - self.reindex = 1 - else: - self.reindex = 0 + self.force_reindex() + + def force_reindex(self): + '''Force a reindex condition + ''' + if os.path.exists(self.indexdb_path): + shutil.rmtree(self.indexdb_path) + os.makedirs(self.indexdb_path) + os.chmod(self.indexdb_path, 0775) + open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n') + self.reindex = 1 def should_reindex(self): '''Should we reindex? @@ -61,16 +67,9 @@ class Indexer: # make sure the index is loaded self.load_index() - # Is file eligible for (re)indexing? + # remove old entries for this identifier if self.files.has_key(identifier): - # Reindexing enabled, cleanup dicts - if self.reindex: - self.purge_entry(identifier, self.files, self.words) - else: - # DO NOT reindex this file - if self.quiet < 5: - print "Not reindexing", identifier - return 0 + self.purge_entry(identifier) # split into words words = self.splitter(text, mime_type) @@ -281,19 +280,20 @@ class Indexer: pickle_fh.write(zlib.compress(pickle_str)) os.chmod(filename, 0664) - def purge_entry(self, fname, file_dct, word_dct): + def purge_entry(self, identifier): ''' Remove a file from file index and word index ''' - try: # The easy part, cleanup the file index - file_index = file_dct[fname] - del file_dct[fname] - except KeyError: - pass # We'll assume we only encounter KeyError's + if not self.files.has_key(identifier): + return + + file_index = self.files[identifier][0] + del self.files[identifier] + del self.fileids[file_index] + # The much harder part, cleanup the word index - for word, occurs in word_dct.items(): + for key, occurs in self.words.items(): if occurs.has_key(file_index): del occurs[file_index] - word_dct[word] = occurs def index_loaded(self): return (hasattr(self,'fileids') and hasattr(self,'files') and @@ -301,6 +301,22 @@ class Indexer: # #$Log: not supported by cvs2svn $ +#Revision 1.4 2002/07/09 03:02:52 richard +#More indexer work: +#- all String properties may now be indexed too. Currently there's a bit of +# "issue" specific code in the actual searching which needs to be +# addressed. In a nutshell: +# + pass 'indexme="yes"' as a String() property initialisation arg, eg: +# file = FileClass(db, "file", name=String(), type=String(), +# comment=String(indexme="yes")) +# + the comment will then be indexed and be searchable, with the results +# related back to the issue that the file is linked to +#- as a result of this work, the FileClass has a default MIME type that may +# be overridden in a subclass, or by the use of a "type" property as is +# done in the default templates. +#- the regeneration of the indexes (if necessary) is done once the schema is +# set up in the dbinit. +# #Revision 1.3 2002/07/08 06:58:15 richard #cleaned up the indexer code: # - it splits more words out (much simpler, faster splitter) diff --git a/roundup/roundupdb.py b/roundup/roundupdb.py index 03bde0d..dc181a4 100644 --- a/roundup/roundupdb.py +++ b/roundup/roundupdb.py @@ -15,7 +15,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -# $Id: roundupdb.py,v 1.60 2002-07-09 03:02:52 richard Exp $ +# $Id: roundupdb.py,v 1.61 2002-07-09 04:19:09 richard Exp $ __doc__ = """ Extending hyperdb with types specific to issue-tracking. @@ -313,7 +313,7 @@ class IssueClass(Class): dictionary attempts to specify any of these properties or a "creation" or "activity" property, a ValueError is raised.""" if not properties.has_key('title'): - properties['title'] = hyperdb.String() + properties['title'] = hyperdb.String(indexme='yes') if not properties.has_key('messages'): properties['messages'] = hyperdb.Multilink("msg") if not properties.has_key('files'): @@ -691,6 +691,22 @@ class IssueClass(Class): # # $Log: not supported by cvs2svn $ +# Revision 1.60 2002/07/09 03:02:52 richard +# More indexer work: +# - all String properties may now be indexed too. Currently there's a bit of +# "issue" specific code in the actual searching which needs to be +# addressed. In a nutshell: +# + pass 'indexme="yes"' as a String() property initialisation arg, eg: +# file = FileClass(db, "file", name=String(), type=String(), +# comment=String(indexme="yes")) +# + the comment will then be indexed and be searchable, with the results +# related back to the issue that the file is linked to +# - as a result of this work, the FileClass has a default MIME type that may +# be overridden in a subclass, or by the use of a "type" property as is +# done in the default templates. +# - the regeneration of the indexes (if necessary) is done once the schema is +# set up in the dbinit. +# # Revision 1.59 2002/06/18 03:55:25 dman13 # Fixed name/address display problem introduced by an earlier change. # (instead of "name" display "name ") diff --git a/test/test_db.py b/test/test_db.py index 814c17a..e8a51b0 100644 --- a/test/test_db.py +++ b/test/test_db.py @@ -15,7 +15,7 @@ # BASIS, AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, # SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. # -# $Id: test_db.py,v 1.24 2002-07-09 03:02:53 richard Exp $ +# $Id: test_db.py,v 1.25 2002-07-09 04:19:09 richard Exp $ import unittest, os, shutil @@ -314,8 +314,29 @@ class anydbmDBTestCase(MyTestCase): {'2': {}}) self.assertEquals(self.db.indexer.search(['flebble'], self.db.issue), {'2': {}, '1': {}}) - self.assertEquals(self.db.indexer.search(['blah'], self.db.issue), - {'1': {'files': ['2']}}) + + def testReindexing(self): + self.db.issue.create(title="frooz") + self.db.commit() + self.assertEquals(self.db.indexer.search(['frooz'], self.db.issue), + {'1': {}}) + self.db.issue.set('1', title="dooble") + self.db.commit() + self.assertEquals(self.db.indexer.search(['dooble'], self.db.issue), + {'1': {}}) + self.assertEquals(self.db.indexer.search(['frooz'], self.db.issue), {}) + + def testForcedReindexing(self): + self.db.issue.create(title="flebble frooz") + self.db.commit() + self.assertEquals(self.db.indexer.search(['flebble'], self.db.issue), + {'1': {}}) + self.db.indexer.quiet = 1 + self.db.indexer.force_reindex() + self.db.post_init() + self.db.indexer.quiet = 9 + self.assertEquals(self.db.indexer.search(['flebble'], self.db.issue), + {'1': {}}) class anydbmReadOnlyDBTestCase(MyTestCase): def setUp(self): @@ -419,6 +440,22 @@ def suite(): # # $Log: not supported by cvs2svn $ +# Revision 1.24 2002/07/09 03:02:53 richard +# More indexer work: +# - all String properties may now be indexed too. Currently there's a bit of +# "issue" specific code in the actual searching which needs to be +# addressed. In a nutshell: +# + pass 'indexme="yes"' as a String() property initialisation arg, eg: +# file = FileClass(db, "file", name=String(), type=String(), +# comment=String(indexme="yes")) +# + the comment will then be indexed and be searchable, with the results +# related back to the issue that the file is linked to +# - as a result of this work, the FileClass has a default MIME type that may +# be overridden in a subclass, or by the use of a "type" property as is +# done in the default templates. +# - the regeneration of the indexes (if necessary) is done once the schema is +# set up in the dbinit. +# # Revision 1.23 2002/06/20 23:51:48 richard # Cleaned up the hyperdb tests # -- 2.30.2