roundup/indexer.py

   1 #
   2 # This module is derived from the module described at:
   3 #   http://gnosis.cx/publish/programming/charming_python_15.txt
   4 #
   5 # Author: David Mertz (mertz@gnosis.cx)
   6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
   7 #            Gregory Popovitch (greg@gpy.com)
   8 #
   9 # The original module was released under this license, and remains under
  10 # it:
  11 #
  12 #     This file is released to the public domain.  I (dqm) would
  13 #     appreciate it if you choose to keep derived works under terms
  14 #     that promote freedom, but obviously am giving up any rights
  15 #     to compel such.
  16 #
  17 #$Id: indexer.py,v 1.17 2004-01-20 03:58:38 richard Exp $
  18 '''
  19 This module provides an indexer class, Indexer, that stores text
  20 indices in a roundup instance.  This class makes searching the content of
  21 messages, string properties and text files possible.
  22 '''
  23 import os, shutil, re, mimetypes, marshal, zlib, errno
  24 from hyperdb import Link, Multilink
  25
  26 class Indexer:
  27     ''' Indexes information from roundup's hyperdb to allow efficient
  28         searching.
  29
  30         Three structures are created by the indexer:
  31           files   {identifier: (fileid, wordcount)}
  32           words   {word: {fileid: count}}
  33           fileids {fileid: identifier}
  34         where identifier is (classname, nodeid, propertyname)
  35     '''
  36     def __init__(self, db_path):
  37         self.indexdb_path = os.path.join(db_path, 'indexes')
  38         self.indexdb = os.path.join(self.indexdb_path, 'index.db')
  39         self.reindex = 0
  40         self.quiet = 9
  41         self.changed = 0
  42
  43         # see if we need to reindex because of a change in code
  44         version = os.path.join(self.indexdb_path, 'version')
  45         if (not os.path.exists(self.indexdb_path) or
  46                 not os.path.exists(version)):
  47             # for now the file itself is a flag
  48             self.force_reindex()
  49         elif os.path.exists(version):
  50             version = open(version).read()
  51             # check the value and reindex if it's not the latest
  52             if version.strip() != '1':
  53                 self.force_reindex()
  54
  55     def force_reindex(self):
  56         '''Force a reindex condition
  57         '''
  58         if os.path.exists(self.indexdb_path):
  59             shutil.rmtree(self.indexdb_path)
  60         os.makedirs(self.indexdb_path)
  61         os.chmod(self.indexdb_path, 0775)
  62         open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
  63         self.reindex = 1
  64         self.changed = 1
  65
  66     def should_reindex(self):
  67         '''Should we reindex?
  68         '''
  69         return self.reindex
  70
  71     def add_text(self, identifier, text, mime_type='text/plain'):
  72         ''' Add some text associated with the (classname, nodeid, property)
  73             identifier.
  74         '''
  75         # make sure the index is loaded
  76         self.load_index()
  77
  78         # remove old entries for this identifier
  79         if self.files.has_key(identifier):
  80             self.purge_entry(identifier)
  81
  82         # split into words
  83         words = self.splitter(text, mime_type)
  84
  85         # Find new file index, and assign it to identifier
  86         # (_TOP uses trick of negative to avoid conflict with file index)
  87         self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
  88         file_index = abs(self.files['_TOP'][0])
  89         self.files[identifier] = (file_index, len(words))
  90         self.fileids[file_index] = identifier
  91
  92         # find the unique words
  93         filedict = {}
  94         for word in words:
  95             if filedict.has_key(word):
  96                 filedict[word] = filedict[word]+1
  97             else:
  98                 filedict[word] = 1
  99
 100         # now add to the totals
 101         for word in filedict.keys():
 102             # each word has a dict of {identifier: count}
 103             if self.words.has_key(word):
 104                 entry = self.words[word]
 105             else:
 106                 # new word
 107                 entry = {}
 108                 self.words[word] = entry
 109
 110             # make a reference to the file for this word
 111             entry[file_index] = filedict[word]
 112
 113         # save needed
 114         self.changed = 1
 115
 116     def splitter(self, text, ftype):
 117         ''' Split the contents of a text string into a list of 'words'
 118         '''
 119         if ftype == 'text/plain':
 120             words = self.text_splitter(text)
 121         else:
 122             return []
 123         return words
 124
 125     def text_splitter(self, text):
 126         """Split text/plain string into a list of words
 127         """
 128         # case insensitive
 129         text = text.upper()
 130
 131         # Split the raw text, losing anything longer than 25 characters
 132         # since that'll be gibberish (encoded text or somesuch) or shorter
 133         # than 3 characters since those short words appear all over the
 134         # place
 135         return re.findall(r'\b\w{2,25}\b', text)
 136
 137     def search(self, search_terms, klass, ignore={},
 138             dre=re.compile(r'([^\d]+)(\d+)')):
 139         ''' Display search results looking for [search, terms] associated
 140             with the hyperdb Class "klass". Ignore hits on {class: property}.
 141
 142             "dre" is a helper, not an argument.
 143         '''
 144         # do the index lookup
 145         hits = self.find(search_terms)
 146         if not hits:
 147             return {}
 148
 149         designator_propname = {}
 150         for nm, propclass in klass.getprops().items():
 151             if isinstance(propclass, Link) or isinstance(propclass, Multilink):
 152                 designator_propname[propclass.classname] = nm
 153
 154         # build a dictionary of nodes and their associated messages
 155         # and files
 156         nodeids = {}      # this is the answer
 157         propspec = {}     # used to do the klass.find
 158         for propname in designator_propname.values():
 159             propspec[propname] = {}   # used as a set (value doesn't matter)
 160         for classname, nodeid, property in hits.values():
 161             # skip this result if we don't care about this class/property
 162             if ignore.has_key((classname, property)):
 163                 continue
 164
 165             # if it's a property on klass, it's easy
 166             if classname == klass.classname:
 167                 if not nodeids.has_key(nodeid):
 168                     nodeids[nodeid] = {}
 169                 continue
 170
 171             # make sure the class is a linked one, otherwise ignore
 172             if not designator_propname.has_key(classname):
 173                 continue
 174
 175             # it's a linked class - set up to do the klass.find
 176             linkprop = designator_propname[classname]   # eg, msg -> messages
 177             propspec[linkprop][nodeid] = 1
 178
 179         # retain only the meaningful entries
 180         for propname, idset in propspec.items():
 181             if not idset:
 182                 del propspec[propname]
 183
 184         # klass.find tells me the klass nodeids the linked nodes relate to
 185         for resid in klass.find(**propspec):
 186             resid = str(resid)
 187             if not nodeids.has_key(id):
 188                 nodeids[resid] = {}
 189             node_dict = nodeids[resid]
 190             # now figure out where it came from
 191             for linkprop in propspec.keys():
 192                 for nodeid in klass.get(resid, linkprop):
 193                     if propspec[linkprop].has_key(nodeid):
 194                         # OK, this node[propname] has a winner
 195                         if not node_dict.has_key(linkprop):
 196                             node_dict[linkprop] = [nodeid]
 197                         else:
 198                             node_dict[linkprop].append(nodeid)
 199         return nodeids
 200
 201     # we override this to ignore not 2 < word < 25 and also to fix a bug -
 202     # the (fail) case.
 203     def find(self, wordlist):
 204         ''' Locate files that match ALL the words in wordlist
 205         '''
 206         if not hasattr(self, 'words'):
 207             self.load_index()
 208         self.load_index(wordlist=wordlist)
 209         entries = {}
 210         hits = None
 211         for word in wordlist:
 212             if not 2 < len(word) < 25:
 213                 # word outside the bounds of what we index - ignore
 214                 continue
 215             word = word.upper()
 216             entry = self.words.get(word)    # For each word, get index
 217             entries[word] = entry           #   of matching files
 218             if not entry:                   # Nothing for this one word (fail)
 219                 return {}
 220             if hits is None:
 221                 hits = {}
 222                 for k in entry.keys():
 223                     if not self.fileids.has_key(k):
 224                         raise ValueError, 'Index is corrupted: re-generate it'
 225                     hits[k] = self.fileids[k]
 226             else:
 227                 # Eliminate hits for every non-match
 228                 for fileid in hits.keys():
 229                     if not entry.has_key(fileid):
 230                         del hits[fileid]
 231         if hits is None:
 232             return {}
 233         return hits
 234
 235     segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
 236     def load_index(self, reload=0, wordlist=None):
 237         # Unless reload is indicated, do not load twice
 238         if self.index_loaded() and not reload:
 239             return 0
 240
 241         # Ok, now let's actually load it
 242         db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}}
 243
 244         # Identify the relevant word-dictionary segments
 245         if not wordlist:
 246             segments = self.segments
 247         else:
 248             segments = ['-','#']
 249             for word in wordlist:
 250                 segments.append(word[0].upper())
 251
 252         # Load the segments
 253         for segment in segments:
 254             try:
 255                 f = open(self.indexdb + segment, 'rb')
 256             except IOError, error:
 257                 # probably just nonexistent segment index file
 258                 if error.errno != errno.ENOENT: raise
 259             else:
 260                 pickle_str = zlib.decompress(f.read())
 261                 f.close()
 262                 dbslice = marshal.loads(pickle_str)
 263                 if dbslice.get('WORDS'):
 264                     # if it has some words, add them
 265                     for word, entry in dbslice['WORDS'].items():
 266                         db['WORDS'][word] = entry
 267                 if dbslice.get('FILES'):
 268                     # if it has some files, add them
 269                     db['FILES'] = dbslice['FILES']
 270                 if dbslice.get('FILEIDS'):
 271                     # if it has fileids, add them
 272                     db['FILEIDS'] = dbslice['FILEIDS']
 273
 274         self.words = db['WORDS']
 275         self.files = db['FILES']
 276         self.fileids = db['FILEIDS']
 277         self.changed = 0
 278
 279     def save_index(self):
 280         # only save if the index is loaded and changed
 281         if not self.index_loaded() or not self.changed:
 282             return
 283
 284         # brutal space saver... delete all the small segments
 285         for segment in self.segments:
 286             try:
 287                 os.remove(self.indexdb + segment)
 288             except OSError, error:
 289                 # probably just nonexistent segment index file
 290                 if error.errno != errno.ENOENT: raise
 291
 292         # First write the much simpler filename/fileid dictionaries
 293         dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
 294         open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))
 295
 296         # The hard part is splitting the word dictionary up, of course
 297         letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
 298         segdicts = {}                           # Need batch of empty dicts
 299         for segment in letters:
 300             segdicts[segment] = {}
 301         for word, entry in self.words.items():  # Split into segment dicts
 302             initchar = word[0].upper()
 303             segdicts[initchar][word] = entry
 304
 305         # save
 306         for initchar in letters:
 307             db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None}
 308             pickle_str = marshal.dumps(db)
 309             filename = self.indexdb + initchar
 310             pickle_fh = open(filename, 'wb')
 311             pickle_fh.write(zlib.compress(pickle_str))
 312             os.chmod(filename, 0664)
 313
 314         # save done
 315         self.changed = 0
 316
 317     def purge_entry(self, identifier):
 318         ''' Remove a file from file index and word index
 319         '''
 320         self.load_index()
 321
 322         if not self.files.has_key(identifier):
 323             return
 324
 325         file_index = self.files[identifier][0]
 326         del self.files[identifier]
 327         del self.fileids[file_index]
 328
 329         # The much harder part, cleanup the word index
 330         for key, occurs in self.words.items():
 331             if occurs.has_key(file_index):
 332                 del occurs[file_index]
 333
 334         # save needed
 335         self.changed = 1
 336
 337     def index_loaded(self):
 338         return (hasattr(self,'fileids') and hasattr(self,'files') and
 339             hasattr(self,'words'))
 340
 341 # vim: set filetype=python ts=4 sw=4 et si