1 #
2 # This module is derived from the module described at:
3 # http://gnosis.cx/publish/programming/charming_python_15.txt
4 #
5 # Author: David Mertz (mertz@gnosis.cx)
6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
7 # Gregory Popovitch (greg@gpy.com)
8 #
9 # The original module was released under this license, and remains under
10 # it:
11 #
12 # This file is released to the public domain. I (dqm) would
13 # appreciate it if you choose to keep derived works under terms
14 # that promote freedom, but obviously am giving up any rights
15 # to compel such.
16 #
17 #$Id: indexer.py,v 1.12 2002-07-19 03:36:33 richard Exp $
18 '''
19 This module provides an indexer class, RoundupIndexer, that stores text
20 indices in a roundup instance. This class makes searching the content of
21 messages, string properties and text files possible.
22 '''
23 import os, shutil, re, mimetypes, marshal, zlib, errno
24 from hyperdb import Link, Multilink
class Indexer:
    '''Indexes information from roundup's hyperdb to allow efficient
    searching.

    Three structures are created by the indexer:
          files   {identifier: (fileid, wordcount)}
          words   {word: {fileid: count}}
          fileids {fileid: identifier}
    where identifier is (classname, nodeid, propertyname)

    The index is persisted as zlib-compressed marshal "segment" files
    under <db_path>/indexes, one per initial word character, plus a '-'
    segment holding the files/fileids maps.
    '''
    def __init__(self, db_path):
        '''Prepare the index storage under db_path.

        A 'version' flag file records the on-disk format; if the index
        directory or the flag file is missing, or it holds an unknown
        version, a full reindex is forced.
        '''
        self.indexdb_path = os.path.join(db_path, 'indexes')
        self.indexdb = os.path.join(self.indexdb_path, 'index.db')
        self.reindex = 0
        # verbosity threshold -- presumably higher means quieter; not
        # consulted anywhere in this module (TODO confirm with callers)
        self.quiet = 9
        self.changed = 0

        # see if we need to reindex because of a change in code
        version = os.path.join(self.indexdb_path, 'version')
        if (not os.path.exists(self.indexdb_path) or
                not os.path.exists(version)):
            # for now the existence of the file itself is the flag
            self.force_reindex()
        elif os.path.exists(version):
            with open(version) as f:
                version = f.read()
            # check the value and reindex if it's not the latest
            if version.strip() != '1':
                self.force_reindex()

    def force_reindex(self):
        '''Throw away any existing index and flag that a full reindex
        is required.
        '''
        if os.path.exists(self.indexdb_path):
            shutil.rmtree(self.indexdb_path)
        os.makedirs(self.indexdb_path)
        os.chmod(self.indexdb_path, 0o775)
        # record the on-disk format version (checked in __init__)
        with open(os.path.join(self.indexdb_path, 'version'), 'w') as f:
            f.write('1\n')
        self.reindex = 1
        self.changed = 1

    def should_reindex(self):
        '''Should we reindex?
        '''
        return self.reindex

    def add_text(self, identifier, text, mime_type='text/plain'):
        '''Add some text associated with the (classname, nodeid, property)
        identifier.
        '''
        # make sure the index is loaded
        self.load_index()

        # remove old entries for this identifier
        if identifier in self.files:
            self.purge_entry(identifier)

        # split into words
        words = self.splitter(text, mime_type)

        # Find new file index, and assign it to identifier
        # (_TOP uses trick of negative to avoid conflict with file index)
        self.files['_TOP'] = (self.files['_TOP'][0] - 1, None)
        file_index = abs(self.files['_TOP'][0])
        self.files[identifier] = (file_index, len(words))
        self.fileids[file_index] = identifier

        # count the occurrences of each unique word
        filedict = {}
        for word in words:
            filedict[word] = filedict.get(word, 0) + 1

        # now add to the totals: each word maps to a dict of
        # {fileid: count}
        for word, count in filedict.items():
            self.words.setdefault(word, {})[file_index] = count

        # save needed
        self.changed = 1

    def splitter(self, text, ftype):
        '''Split the contents of a text string into a list of 'words'.

        Only text/plain is handled; any other MIME type yields no words
        (and therefore isn't indexed).
        '''
        if ftype == 'text/plain':
            return self.text_splitter(text)
        return []

    def text_splitter(self, text):
        """Split text/plain string into a list of uppercased words.
        """
        # case insensitive
        text = text.upper()

        # Split the raw text, losing anything longer than 25 characters
        # since that'll be gibberish (encoded text or somesuch) or shorter
        # than 3 characters since those short words appear all over the
        # place
        # NOTE(review): the pattern actually admits 2-character words;
        # find() filters those out again with "2 < len(word) < 25"
        return re.findall(r'\b\w{2,25}\b', text)

    def search(self, search_terms, klass, ignore={},
            dre=re.compile(r'([^\d]+)(\d+)')):
        ''' Display search results looking for [search, terms] associated
        with the hyperdb Class "klass". Ignore hits on {class: property}.

        "dre" is a helper, not an argument.  "ignore" is only ever read,
        never mutated, so the mutable default is safe.
        '''
        # do the index lookup
        hits = self.find(search_terms)
        if not hits:
            return {}

        # map the classname of each Link/Multilink property of klass to
        # the property name, eg. msg -> messages
        designator_propname = {}
        for nm, propclass in klass.getprops().items():
            if isinstance(propclass, Link) or isinstance(propclass, Multilink):
                designator_propname[propclass.classname] = nm

        # build a dictionary of nodes and their associated messages
        # and files
        nodeids = {}    # this is the answer
        propspec = {}   # used to do the klass.find
        for propname in designator_propname.values():
            propspec[propname] = {}  # used as a set (value doesn't matter)
        for classname, nodeid, property in hits.values():
            # skip this result if we don't care about this class/property
            if (classname, property) in ignore:
                continue

            # if it's a property on klass, it's easy
            if classname == klass.classname:
                if nodeid not in nodeids:
                    nodeids[nodeid] = {}
                continue

            # it's a linked class - set up to do the klass.find
            linkprop = designator_propname[classname]   # eg, msg -> messages
            propspec[linkprop][nodeid] = 1

        # retain only the meaningful entries (iterate over a copy of the
        # keys since we delete as we go)
        for propname in list(propspec.keys()):
            if not propspec[propname]:
                del propspec[propname]

        # klass.find tells me the klass nodeids the linked nodes relate to
        for resid in klass.find(**propspec):
            resid = str(resid)
            # BUGFIX: this used to test "nodeids.has_key(id)" -- "id" is
            # the builtin function, so the test was always false and any
            # direct hit recorded above was clobbered with a fresh dict
            if resid not in nodeids:
                nodeids[resid] = {}
            node_dict = nodeids[resid]
            # now figure out where it came from
            for linkprop in propspec.keys():
                for nodeid in klass.get(resid, linkprop):
                    if nodeid in propspec[linkprop]:
                        # OK, this node[propname] has a winner
                        node_dict.setdefault(linkprop, []).append(nodeid)
        return nodeids

    # we override this to ignore not 2 < word < 25 and also to fix a bug -
    # the (fail) case.
    def find(self, wordlist):
        ''' Locate files that match ALL the words in wordlist.

        Returns {fileid: (classname, nodeid, propertyname)} or {} when
        any word misses (or no word is indexable).
        '''
        if not self.index_loaded():
            # Load the full index, not just the segments for wordlist: a
            # partial load would later be mistaken for a complete one
            # (index_loaded() would be true) and save_index() would then
            # silently drop the unloaded segments.
            self.load_index()

        hits = None
        for word in wordlist:
            if not 2 < len(word) < 25:
                # word outside the bounds of what we index - ignore
                continue
            word = word.upper()
            entry = self.words.get(word)    # For each word, get index
            if not entry:                   # Nothing for this one word (fail)
                return {}
            if hits is None:
                # first indexable word seeds the result set
                hits = {}
                for fileid in entry:
                    hits[fileid] = self.fileids[fileid]
            else:
                # Eliminate hits for every non-match (iterate a copy of
                # the keys since we delete while scanning)
                for fileid in list(hits.keys()):
                    if fileid not in entry:
                        del hits[fileid]
        if hits is None:
            # no word was within the indexable length bounds
            return {}
        return hits

    # every segment file the index may be split over; '-' holds the
    # files/fileids maps
    segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"

    def load_index(self, reload=0, wordlist=None):
        '''Load the index from its segment files.

        Unless "reload" is true this is a no-op when the index is
        already in memory.  If "wordlist" is given, only the segments
        relevant to those words are read.
        '''
        # Unless reload is indicated, do not load twice
        if self.index_loaded() and not reload:
            return 0

        # Ok, now let's actually load it
        db = {'WORDS': {}, 'FILES': {'_TOP': (0, None)}, 'FILEIDS': {}}

        # Identify the relevant word-dictionary segments
        if not wordlist:
            segments = self.segments
        else:
            segments = ['-', '#']
            for word in wordlist:
                segments.append(word[0].upper())

        # Load the segments
        for segment in segments:
            try:
                f = open(self.indexdb + segment, 'rb')
            except IOError as error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise
            else:
                try:
                    pickle_str = zlib.decompress(f.read())
                finally:
                    f.close()
                dbslice = marshal.loads(pickle_str)
                if dbslice.get('WORDS'):
                    # if it has some words, add them
                    db['WORDS'].update(dbslice['WORDS'])
                if dbslice.get('FILES'):
                    # if it has some files, add them
                    db['FILES'] = dbslice['FILES']
                if dbslice.get('FILEIDS'):
                    # if it has fileids, add them
                    db['FILEIDS'] = dbslice['FILEIDS']

        self.words = db['WORDS']
        self.files = db['FILES']
        self.fileids = db['FILEIDS']
        self.changed = 0

    def save_index(self):
        '''Write the in-memory index back out to the segment files.

        No-op unless the index is both loaded and changed.
        '''
        if not self.index_loaded() or not self.changed:
            return

        # brutal space saver... delete all the small segments
        for segment in self.segments:
            try:
                os.remove(self.indexdb + segment)
            except OSError as error:
                # probably just nonexistent segment index file
                if error.errno != errno.ENOENT:
                    raise

        # First write the much simpler filename/fileid dictionaries
        dbfil = {'WORDS': None, 'FILES': self.files, 'FILEIDS': self.fileids}
        with open(self.indexdb + '-', 'wb') as f:
            f.write(zlib.compress(marshal.dumps(dbfil)))

        # The hard part is splitting the word dictionary up, of course
        letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
        segdicts = {}                           # Need batch of empty dicts
        for segment in letters:
            segdicts[segment] = {}
        for word, entry in self.words.items():  # Split into segment dicts
            initchar = word[0].upper()
            segdicts[initchar][word] = entry

        # save
        for initchar in letters:
            db = {'WORDS': segdicts[initchar], 'FILES': None, 'FILEIDS': None}
            pickle_str = marshal.dumps(db)
            filename = self.indexdb + initchar
            with open(filename, 'wb') as f:
                f.write(zlib.compress(pickle_str))
            os.chmod(filename, 0o664)

        # save done
        self.changed = 0

    def purge_entry(self, identifier):
        ''' Remove a file from file index and word index
        '''
        self.load_index()

        if identifier not in self.files:
            return

        file_index = self.files[identifier][0]
        del self.files[identifier]
        del self.fileids[file_index]

        # The much harder part, cleanup the word index.  Safe to iterate
        # directly: we only mutate the per-word dicts, never self.words
        # itself.
        for key, occurs in self.words.items():
            if file_index in occurs:
                del occurs[file_index]

        # save needed
        self.changed = 1

    def index_loaded(self):
        '''Return true if the words/files/fileids maps are in memory.'''
        return (hasattr(self, 'fileids') and hasattr(self, 'files') and
                hasattr(self, 'words'))
336 #
337 #$Log: not supported by cvs2svn $
338 #Revision 1.11 2002/07/18 11:17:30 gmcm
339 #Add Number and Boolean types to hyperdb.
340 #Add conversion cases to web, mail & admin interfaces.
341 #Add storage/serialization cases to back_anydbm & back_metakit.
342 #
343 #Revision 1.10 2002/07/14 23:17:24 richard
344 #oops
345 #
346 #Revision 1.9 2002/07/14 06:11:16 richard
347 #Some TODOs
348 #
349 #Revision 1.8 2002/07/09 21:53:38 gmcm
350 #Optimize Class.find so that the propspec can contain a set of ids to match.
351 #This is used by indexer.search so it can do just one find for all the index matches.
352 #This was already confusing code, but for common terms (lots of index matches),
353 #it is enormously faster.
354 #
355 #Revision 1.7 2002/07/09 21:38:43 richard
356 #Only save the index if the thing is loaded and changed. Also, don't load
357 #the index just for a save.
358 #
359 #Revision 1.6 2002/07/09 04:26:44 richard
360 #We're indexing numbers now, and _underscore words
361 #
362 #Revision 1.5 2002/07/09 04:19:09 richard
363 #Added reindex command to roundup-admin.
364 #Fixed reindex on first access.
365 #Also fixed reindexing of entries that change.
366 #
367 #Revision 1.4 2002/07/09 03:02:52 richard
368 #More indexer work:
369 #- all String properties may now be indexed too. Currently there's a bit of
370 # "issue" specific code in the actual searching which needs to be
371 # addressed. In a nutshell:
372 # + pass 'indexme="yes"' as a String() property initialisation arg, eg:
373 # file = FileClass(db, "file", name=String(), type=String(),
374 # comment=String(indexme="yes"))
375 # + the comment will then be indexed and be searchable, with the results
376 # related back to the issue that the file is linked to
377 #- as a result of this work, the FileClass has a default MIME type that may
378 # be overridden in a subclass, or by the use of a "type" property as is
379 # done in the default templates.
380 #- the regeneration of the indexes (if necessary) is done once the schema is
381 # set up in the dbinit.
382 #
383 #Revision 1.3 2002/07/08 06:58:15 richard
384 #cleaned up the indexer code:
385 # - it splits more words out (much simpler, faster splitter)
386 # - removed code we'll never use (roundup.roundup_indexer has the full
387 # implementation, and replaces roundup.indexer)
388 # - only index text/plain and rfc822/message (ideas for other text formats to
389 # index are welcome)
390 # - added simple unit test for indexer. Needs more tests for regression.
391 #
392 #Revision 1.2 2002/05/25 07:16:24 rochecompaan
393 #Merged search_indexing-branch with HEAD
394 #
395 #Revision 1.1.2.3 2002/05/02 11:52:12 rochecompaan
396 #Fixed small bug that prevented indexes from being generated.
397 #
398 #Revision 1.1.2.2 2002/04/19 19:54:42 rochecompaan
399 #cgi_client.py
400 # removed search link for the time being
401 # moved rendering of matches to htmltemplate
402 #hyperdb.py
403 # filtering of nodes on full text search incorporated in filter method
404 #roundupdb.py
405 # added paramater to call of filter method
406 #roundup_indexer.py
407 # added search method to RoundupIndexer class
408 #
409 #Revision 1.1.2.1 2002/04/03 11:55:57 rochecompaan
410 # . Added feature #526730 - search for messages capability
411 #