roundup/indexer.py

   1 #
   2 # This module is derived from the module described at:
   3 #   http://gnosis.cx/publish/programming/charming_python_15.txt
   4 #
   5 # Author: David Mertz (mertz@gnosis.cx)
   6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
   7 #            Gregory Popovitch (greg@gpy.com)
   8 #
   9 # The original module was released under this license, and remains under
  10 # it:
  11 #
  12 #     This file is released to the public domain.  I (dqm) would
  13 #     appreciate it if you choose to keep derived works under terms
  14 #     that promote freedom, but obviously am giving up any rights
  15 #     to compel such.
  16 #
  17 #$Id: indexer.py,v 1.8 2002-07-09 21:53:38 gmcm Exp $
  18 '''
  19 This module provides an indexer class, Indexer, that stores text
  20 indices in a roundup instance.  This class makes searching the content of
  21 messages, string properties and text files possible.
  22 '''
  23 import os, shutil, re, mimetypes, marshal, zlib, errno
  24 from hyperdb import Link, Multilink
  25
  26 class Indexer:
  27     ''' Indexes information from roundup's hyperdb to allow efficient
  28         searching.
  29
  30         Three structures are created by the indexer:
  31           files   {identifier: (fileid, wordcount)}
  32           words   {word: {fileid: count}}
  33           fileids {fileid: identifier}
  34         where identifier is (classname, nodeid, propertyname)
  35     '''
  36     def __init__(self, db_path):
  37         self.indexdb_path = os.path.join(db_path, 'indexes')
  38         self.indexdb = os.path.join(self.indexdb_path, 'index.db')
  39         self.reindex = 0
  40         self.quiet = 9
  41         self.changed = 0
  42
  43         # see if we need to reindex because of a change in code
  44         if (not os.path.exists(self.indexdb_path) or
  45                 not os.path.exists(os.path.join(self.indexdb_path, 'version'))):
  46             # TODO: if the version file exists (in the future) we'll want to
  47             # check the value in it - for now the file itself is a flag
  48             self.force_reindex()
  49
  50     def force_reindex(self):
  51         '''Force a reindex condition
  52         '''
  53         if os.path.exists(self.indexdb_path):
  54             shutil.rmtree(self.indexdb_path)
  55         os.makedirs(self.indexdb_path)
  56         os.chmod(self.indexdb_path, 0775)
  57         open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
  58         self.reindex = 1
  59         self.changed = 1
  60
  61     def should_reindex(self):
  62         '''Should we reindex?
  63         '''
  64         return self.reindex
  65
  66     def add_text(self, identifier, text, mime_type='text/plain'):
  67         ''' Add some text associated with the (classname, nodeid, property)
  68             identifier.
  69         '''
  70         # make sure the index is loaded
  71         self.load_index()
  72
  73         # remove old entries for this identifier
  74         if self.files.has_key(identifier):
  75             self.purge_entry(identifier)
  76
  77         # split into words
  78         words = self.splitter(text, mime_type)
  79
  80         # Find new file index, and assign it to identifier
  81         # (_TOP uses trick of negative to avoid conflict with file index)
  82         self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
  83         file_index = abs(self.files['_TOP'][0])
  84         self.files[identifier] = (file_index, len(words))
  85         self.fileids[file_index] = identifier
  86
  87         # find the unique words
  88         filedict = {}
  89         for word in words:
  90             if filedict.has_key(word):
  91                 filedict[word] = filedict[word]+1
  92             else:
  93                 filedict[word] = 1
  94
  95         # now add to the totals
  96         for word in filedict.keys():
  97             # each word has a dict of {identifier: count}
  98             if self.words.has_key(word):
  99                 entry = self.words[word]
 100             else:
 101                 # new word
 102                 entry = {}
 103                 self.words[word] = entry
 104
 105             # make a reference to the file for this word
 106             entry[file_index] = filedict[word]
 107
 108         # save needed
 109         self.changed = 1
 110
 111     def splitter(self, text, ftype):
 112         ''' Split the contents of a text string into a list of 'words'
 113         '''
 114         if ftype == 'text/plain':
 115             words = self.text_splitter(text)
 116         else:
 117             return []
 118         return words
 119
 120     def text_splitter(self, text):
 121         """Split text/plain string into a list of words
 122         """
 123         # case insensitive
 124         text = text.upper()
 125
 126         # Split the raw text, losing anything longer than 25 characters
 127         # since that'll be gibberish (encoded text or somesuch) or shorter
 128         # than 3 characters since those short words appear all over the
 129         # place
 130         return re.findall(r'\b\w{2,25}\b', text)
 131
 132     def search(self, search_terms, klass, ignore={},
 133             dre=re.compile(r'([^\d]+)(\d+)')):
 134         ''' Display search results looking for [search, terms] associated
 135             with the hyperdb Class "klass". Ignore hits on {class: property}.
 136
 137             "dre" is a helper, not an argument.
 138         '''
 139         # do the index lookup
 140         hits = self.find(search_terms)
 141         if not hits:
 142             return {}
 143
 144         #designator_propname = {'msg': 'messages', 'file': 'files'}
 145         designator_propname = {}
 146         for nm, propclass in klass.getprops().items():
 147             if isinstance(propclass, Link) or isinstance(propclass, Multilink):
 148                 designator_propname[propclass.classname] = nm
 149
 150         # build a dictionary of nodes and their associated messages
 151         # and files
 152         nodeids = {}    # this is the answer
 153         propspec = {}     # used to do the klass.find
 154         for propname in designator_propname.values():
 155             propspec[propname] = {}   # used as a set (value doesn't matter)
 156         for classname, nodeid, property in hits.values():
 157             # skip this result if we don't care about this class/property
 158             if ignore.has_key((classname, property)):
 159                 continue
 160
 161             # if it's a property on klass, it's easy
 162             if classname == klass.classname:
 163                 if not nodeids.has_key(nodeid):
 164                     nodeids[nodeid] = {}
 165                 continue
 166
 167             # it's a linked class - set up to do the klass.find
 168             linkprop = designator_propname[classname]   # eg, msg -> messages
 169             propspec[linkprop][nodeid] = 1
 170
 171         # retain only the meaningful entries
 172         for propname, idset in propspec.items():
 173             if not idset:
 174                 del propspec[propname]
 175
 176         # klass.find tells me the klass nodeids the linked nodes relate to
 177         for resid in klass.find(**propspec):
 178             resid = str(resid)
 179             if not nodeids.has_key(id):
 180                 nodeids[resid] = {}
 181             node_dict = nodeids[resid]
 182             # now figure out where it came from
 183             for linkprop in propspec.keys():
 184                 for nodeid in klass.get(resid, linkprop):
 185                     if propspec[linkprop].has_key(nodeid):
 186                         # OK, this node[propname] has a winner
 187                         if not node_dict.has_key(linkprop):
 188                             node_dict[linkprop] = [nodeid]
 189                         else:
 190                             node_dict[linkprop].append(nodeid)
 191         return nodeids
 192
 193     # we override this to ignore not 2 < word < 25 and also to fix a bug -
 194     # the (fail) case.
 195     def find(self, wordlist):
 196         ''' Locate files that match ALL the words in wordlist
 197         '''
 198         if not hasattr(self, 'words'):
 199             self.load_index()
 200         self.load_index(wordlist=wordlist)
 201         entries = {}
 202         hits = None
 203         for word in wordlist:
 204             if not 2 < len(word) < 25:
 205                 # word outside the bounds of what we index - ignore
 206                 continue
 207             word = word.upper()
 208             entry = self.words.get(word)    # For each word, get index
 209             entries[word] = entry           #   of matching files
 210             if not entry:                   # Nothing for this one word (fail)
 211                 return {}
 212             if hits is None:
 213                 hits = {}
 214                 for k in entry.keys():
 215                     hits[k] = self.fileids[k]
 216             else:
 217                 # Eliminate hits for every non-match
 218                 for fileid in hits.keys():
 219                     if not entry.has_key(fileid):
 220                         del hits[fileid]
 221         if hits is None:
 222             return {}
 223         return hits
 224
 225     segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
 226     def load_index(self, reload=0, wordlist=None):
 227         # Unless reload is indicated, do not load twice
 228         if self.index_loaded() and not reload:
 229             return 0
 230
 231         # Ok, now let's actually load it
 232         db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}}
 233
 234         # Identify the relevant word-dictionary segments
 235         if not wordlist:
 236             segments = self.segments
 237         else:
 238             segments = ['-','#']
 239             for word in wordlist:
 240                 segments.append(word[0].upper())
 241
 242         # Load the segments
 243         for segment in segments:
 244             try:
 245                 f = open(self.indexdb + segment, 'rb')
 246             except IOError, error:
 247                 if error.errno != errno.ENOENT:
 248                     raise
 249             else:
 250                 pickle_str = zlib.decompress(f.read())
 251                 f.close()
 252                 dbslice = marshal.loads(pickle_str)
 253                 if dbslice.get('WORDS'):
 254                     # if it has some words, add them
 255                     for word, entry in dbslice['WORDS'].items():
 256                         db['WORDS'][word] = entry
 257                 if dbslice.get('FILES'):
 258                     # if it has some files, add them
 259                     db['FILES'] = dbslice['FILES']
 260                 if dbslice.get('FILEIDS'):
 261                     # if it has fileids, add them
 262                     db['FILEIDS'] = dbslice['FILEIDS']
 263
 264         self.words = db['WORDS']
 265         self.files = db['FILES']
 266         self.fileids = db['FILEIDS']
 267         self.changed = 0
 268
 269     def save_index(self):
 270         # only save if the index is loaded and changed
 271         if not self.index_loaded() or not self.changed:
 272             return
 273
 274         # brutal space saver... delete all the small segments
 275         for segment in self.segments:
 276             try:
 277                 os.remove(self.indexdb + segment)
 278             except OSError:
 279                 # probably just nonexistent segment index file
 280                 # TODO: make sure it's an EEXIST
 281                 pass
 282
 283         # First write the much simpler filename/fileid dictionaries
 284         dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
 285         open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))
 286
 287         # The hard part is splitting the word dictionary up, of course
 288         letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
 289         segdicts = {}                           # Need batch of empty dicts
 290         for segment in letters:
 291             segdicts[segment] = {}
 292         for word, entry in self.words.items():  # Split into segment dicts
 293             initchar = word[0].upper()
 294             segdicts[initchar][word] = entry
 295
 296         # save
 297         for initchar in letters:
 298             db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None}
 299             pickle_str = marshal.dumps(db)
 300             filename = self.indexdb + initchar
 301             pickle_fh = open(filename, 'wb')
 302             pickle_fh.write(zlib.compress(pickle_str))
 303             os.chmod(filename, 0664)
 304
 305         # save done
 306         self.changed = 0
 307
 308     def purge_entry(self, identifier):
 309         ''' Remove a file from file index and word index
 310         '''
 311         if not self.files.has_key(identifier):
 312             return
 313
 314         file_index = self.files[identifier][0]
 315         del self.files[identifier]
 316         del self.fileids[file_index]
 317
 318         # The much harder part, cleanup the word index
 319         for key, occurs in self.words.items():
 320             if occurs.has_key(file_index):
 321                 del occurs[file_index]
 322
 323         # save needed
 324         self.changed = 1
 325
 326     def index_loaded(self):
 327         return (hasattr(self,'fileids') and hasattr(self,'files') and
 328             hasattr(self,'words'))
 329
 330 #
 331 #$Log: not supported by cvs2svn $
 332 #Revision 1.7  2002/07/09 21:38:43  richard
 333 #Only save the index if the thing is loaded and changed. Also, don't load
 334 #the index just for a save.
 335 #
 336 #Revision 1.6  2002/07/09 04:26:44  richard
 337 #We're indexing numbers now, and _underscore words
 338 #
 339 #Revision 1.5  2002/07/09 04:19:09  richard
 340 #Added reindex command to roundup-admin.
 341 #Fixed reindex on first access.
 342 #Also fixed reindexing of entries that change.
 343 #
 344 #Revision 1.4  2002/07/09 03:02:52  richard
 345 #More indexer work:
 346 #- all String properties may now be indexed too. Currently there's a bit of
 347 #  "issue" specific code in the actual searching which needs to be
 348 #  addressed. In a nutshell:
 349 #  + pass 'indexme="yes"' as a String() property initialisation arg, eg:
 350 #        file = FileClass(db, "file", name=String(), type=String(),
 351 #            comment=String(indexme="yes"))
 352 #  + the comment will then be indexed and be searchable, with the results
 353 #    related back to the issue that the file is linked to
 354 #- as a result of this work, the FileClass has a default MIME type that may
 355 #  be overridden in a subclass, or by the use of a "type" property as is
 356 #  done in the default templates.
 357 #- the regeneration of the indexes (if necessary) is done once the schema is
 358 #  set up in the dbinit.
 359 #
 360 #Revision 1.3  2002/07/08 06:58:15  richard
 361 #cleaned up the indexer code:
 362 # - it splits more words out (much simpler, faster splitter)
 363 # - removed code we'll never use (roundup.roundup_indexer has the full
 364 #   implementation, and replaces roundup.indexer)
 365 # - only index text/plain and rfc822/message (ideas for other text formats to
 366 #   index are welcome)
 367 # - added simple unit test for indexer. Needs more tests for regression.
 368 #
 369 #Revision 1.2  2002/05/25 07:16:24  rochecompaan
 370 #Merged search_indexing-branch with HEAD
 371 #
 372 #Revision 1.1.2.3  2002/05/02 11:52:12  rochecompaan
 373 #Fixed small bug that prevented indexes from being generated.
 374 #
 375 #Revision 1.1.2.2  2002/04/19 19:54:42  rochecompaan
 376 #cgi_client.py
 377 #    removed search link for the time being
 378 #    moved rendering of matches to htmltemplate
 379 #hyperdb.py
 380 #    filtering of nodes on full text search incorporated in filter method
 381 #roundupdb.py
  382 #    added parameter to call of filter method
 383 #roundup_indexer.py
 384 #    added search method to RoundupIndexer class
 385 #
 386 #Revision 1.1.2.1  2002/04/03 11:55:57  rochecompaan
 387 # . Added feature #526730 - search for messages capability
 388 #