roundup/indexer.py

   1 #
   2 # This module is derived from the module described at:
   3 #   http://gnosis.cx/publish/programming/charming_python_15.txt
   4 #
   5 # Author: David Mertz (mertz@gnosis.cx)
   6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
   7 #            Gregory Popovitch (greg@gpy.com)
   8 #
   9 # The original module was released under this license, and remains under
  10 # it:
  11 #
  12 #     This file is released to the public domain.  I (dqm) would
  13 #     appreciate it if you choose to keep derived works under terms
  14 #     that promote freedom, but obviously am giving up any rights
  15 #     to compel such.
  16 #
  17 #$Id: indexer.py,v 1.16 2003-01-14 03:56:44 richard Exp $
  18 '''
  19 This module provides an indexer class, Indexer, that stores text
  20 indices in a roundup instance.  This class makes searching the content of
  21 messages, string properties and text files possible.
  22 '''
  23 import os, shutil, re, mimetypes, marshal, zlib, errno
  24 from hyperdb import Link, Multilink
  25
  26 class Indexer:
  27     ''' Indexes information from roundup's hyperdb to allow efficient
  28         searching.
  29
  30         Three structures are created by the indexer:
  31           files   {identifier: (fileid, wordcount)}
  32           words   {word: {fileid: count}}
  33           fileids {fileid: identifier}
  34         where identifier is (classname, nodeid, propertyname)
  35     '''
  36     def __init__(self, db_path):
  37         self.indexdb_path = os.path.join(db_path, 'indexes')
  38         self.indexdb = os.path.join(self.indexdb_path, 'index.db')
  39         self.reindex = 0
  40         self.quiet = 9
  41         self.changed = 0
  42
  43         # see if we need to reindex because of a change in code
  44         version = os.path.join(self.indexdb_path, 'version')
  45         if (not os.path.exists(self.indexdb_path) or
  46                 not os.path.exists(version)):
  47             # for now the file itself is a flag
  48             self.force_reindex()
  49         elif os.path.exists(version):
  50             version = open(version).read()
  51             # check the value and reindex if it's not the latest
  52             if version.strip() != '1':
  53                 self.force_reindex()
  54
  55     def force_reindex(self):
  56         '''Force a reindex condition
  57         '''
  58         if os.path.exists(self.indexdb_path):
  59             shutil.rmtree(self.indexdb_path)
  60         os.makedirs(self.indexdb_path)
  61         os.chmod(self.indexdb_path, 0775)
  62         open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
  63         self.reindex = 1
  64         self.changed = 1
  65
  66     def should_reindex(self):
  67         '''Should we reindex?
  68         '''
  69         return self.reindex
  70
  71     def add_text(self, identifier, text, mime_type='text/plain'):
  72         ''' Add some text associated with the (classname, nodeid, property)
  73             identifier.
  74         '''
  75         # make sure the index is loaded
  76         self.load_index()
  77
  78         # remove old entries for this identifier
  79         if self.files.has_key(identifier):
  80             self.purge_entry(identifier)
  81
  82         # split into words
  83         words = self.splitter(text, mime_type)
  84
  85         # Find new file index, and assign it to identifier
  86         # (_TOP uses trick of negative to avoid conflict with file index)
  87         self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
  88         file_index = abs(self.files['_TOP'][0])
  89         self.files[identifier] = (file_index, len(words))
  90         self.fileids[file_index] = identifier
  91
  92         # find the unique words
  93         filedict = {}
  94         for word in words:
  95             if filedict.has_key(word):
  96                 filedict[word] = filedict[word]+1
  97             else:
  98                 filedict[word] = 1
  99
 100         # now add to the totals
 101         for word in filedict.keys():
 102             # each word has a dict of {identifier: count}
 103             if self.words.has_key(word):
 104                 entry = self.words[word]
 105             else:
 106                 # new word
 107                 entry = {}
 108                 self.words[word] = entry
 109
 110             # make a reference to the file for this word
 111             entry[file_index] = filedict[word]
 112
 113         # save needed
 114         self.changed = 1
 115
 116     def splitter(self, text, ftype):
 117         ''' Split the contents of a text string into a list of 'words'
 118         '''
 119         if ftype == 'text/plain':
 120             words = self.text_splitter(text)
 121         else:
 122             return []
 123         return words
 124
 125     def text_splitter(self, text):
 126         """Split text/plain string into a list of words
 127         """
 128         # case insensitive
 129         text = text.upper()
 130
 131         # Split the raw text, losing anything longer than 25 characters
 132         # since that'll be gibberish (encoded text or somesuch) or shorter
 133         # than 3 characters since those short words appear all over the
 134         # place
 135         return re.findall(r'\b\w{2,25}\b', text)
 136
 137     def search(self, search_terms, klass, ignore={},
 138             dre=re.compile(r'([^\d]+)(\d+)')):
 139         ''' Display search results looking for [search, terms] associated
 140             with the hyperdb Class "klass". Ignore hits on {class: property}.
 141
 142             "dre" is a helper, not an argument.
 143         '''
 144         # do the index lookup
 145         hits = self.find(search_terms)
 146         if not hits:
 147             return {}
 148
 149         #designator_propname = {'msg': 'messages', 'file': 'files'}
 150         designator_propname = {}
 151         for nm, propclass in klass.getprops().items():
 152             if isinstance(propclass, Link) or isinstance(propclass, Multilink):
 153                 designator_propname[propclass.classname] = nm
 154
 155         # build a dictionary of nodes and their associated messages
 156         # and files
 157         nodeids = {}    # this is the answer
 158         propspec = {}     # used to do the klass.find
 159         for propname in designator_propname.values():
 160             propspec[propname] = {}   # used as a set (value doesn't matter)
 161         for classname, nodeid, property in hits.values():
 162             # skip this result if we don't care about this class/property
 163             if ignore.has_key((classname, property)):
 164                 continue
 165
 166             # if it's a property on klass, it's easy
 167             if classname == klass.classname:
 168                 if not nodeids.has_key(nodeid):
 169                     nodeids[nodeid] = {}
 170                 continue
 171
 172             # make sure the class is a linked one, otherwise ignore
 173             if not designator_propname.has_key(classname):
 174                 continue
 175
 176             # it's a linked class - set up to do the klass.find
 177             linkprop = designator_propname[classname]   # eg, msg -> messages
 178             propspec[linkprop][nodeid] = 1
 179
 180         # retain only the meaningful entries
 181         for propname, idset in propspec.items():
 182             if not idset:
 183                 del propspec[propname]
 184
 185         # klass.find tells me the klass nodeids the linked nodes relate to
 186         for resid in klass.find(**propspec):
 187             resid = str(resid)
 188             if not nodeids.has_key(id):
 189                 nodeids[resid] = {}
 190             node_dict = nodeids[resid]
 191             # now figure out where it came from
 192             for linkprop in propspec.keys():
 193                 for nodeid in klass.get(resid, linkprop):
 194                     if propspec[linkprop].has_key(nodeid):
 195                         # OK, this node[propname] has a winner
 196                         if not node_dict.has_key(linkprop):
 197                             node_dict[linkprop] = [nodeid]
 198                         else:
 199                             node_dict[linkprop].append(nodeid)
 200         return nodeids
 201
 202     # we override this to ignore not 2 < word < 25 and also to fix a bug -
 203     # the (fail) case.
 204     def find(self, wordlist):
 205         ''' Locate files that match ALL the words in wordlist
 206         '''
 207         if not hasattr(self, 'words'):
 208             self.load_index()
 209         self.load_index(wordlist=wordlist)
 210         entries = {}
 211         hits = None
 212         for word in wordlist:
 213             if not 2 < len(word) < 25:
 214                 # word outside the bounds of what we index - ignore
 215                 continue
 216             word = word.upper()
 217             entry = self.words.get(word)    # For each word, get index
 218             entries[word] = entry           #   of matching files
 219             if not entry:                   # Nothing for this one word (fail)
 220                 return {}
 221             if hits is None:
 222                 hits = {}
 223                 for k in entry.keys():
 224                     if not self.fileids.has_key(k):
 225                         raise ValueError, 'Index is corrupted: re-generate it'
 226                     hits[k] = self.fileids[k]
 227             else:
 228                 # Eliminate hits for every non-match
 229                 for fileid in hits.keys():
 230                     if not entry.has_key(fileid):
 231                         del hits[fileid]
 232         if hits is None:
 233             return {}
 234         return hits
 235
 236     segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
 237     def load_index(self, reload=0, wordlist=None):
 238         # Unless reload is indicated, do not load twice
 239         if self.index_loaded() and not reload:
 240             return 0
 241
 242         # Ok, now let's actually load it
 243         db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}}
 244
 245         # Identify the relevant word-dictionary segments
 246         if not wordlist:
 247             segments = self.segments
 248         else:
 249             segments = ['-','#']
 250             for word in wordlist:
 251                 segments.append(word[0].upper())
 252
 253         # Load the segments
 254         for segment in segments:
 255             try:
 256                 f = open(self.indexdb + segment, 'rb')
 257             except IOError, error:
 258                 # probably just nonexistent segment index file
 259                 if error.errno != errno.ENOENT: raise
 260             else:
 261                 pickle_str = zlib.decompress(f.read())
 262                 f.close()
 263                 dbslice = marshal.loads(pickle_str)
 264                 if dbslice.get('WORDS'):
 265                     # if it has some words, add them
 266                     for word, entry in dbslice['WORDS'].items():
 267                         db['WORDS'][word] = entry
 268                 if dbslice.get('FILES'):
 269                     # if it has some files, add them
 270                     db['FILES'] = dbslice['FILES']
 271                 if dbslice.get('FILEIDS'):
 272                     # if it has fileids, add them
 273                     db['FILEIDS'] = dbslice['FILEIDS']
 274
 275         self.words = db['WORDS']
 276         self.files = db['FILES']
 277         self.fileids = db['FILEIDS']
 278         self.changed = 0
 279
 280     def save_index(self):
 281         # only save if the index is loaded and changed
 282         if not self.index_loaded() or not self.changed:
 283             return
 284
 285         # brutal space saver... delete all the small segments
 286         for segment in self.segments:
 287             try:
 288                 os.remove(self.indexdb + segment)
 289             except OSError, error:
 290                 # probably just nonexistent segment index file
 291                 if error.errno != errno.ENOENT: raise
 292
 293         # First write the much simpler filename/fileid dictionaries
 294         dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
 295         open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))
 296
 297         # The hard part is splitting the word dictionary up, of course
 298         letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
 299         segdicts = {}                           # Need batch of empty dicts
 300         for segment in letters:
 301             segdicts[segment] = {}
 302         for word, entry in self.words.items():  # Split into segment dicts
 303             initchar = word[0].upper()
 304             segdicts[initchar][word] = entry
 305
 306         # save
 307         for initchar in letters:
 308             db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None}
 309             pickle_str = marshal.dumps(db)
 310             filename = self.indexdb + initchar
 311             pickle_fh = open(filename, 'wb')
 312             pickle_fh.write(zlib.compress(pickle_str))
 313             os.chmod(filename, 0664)
 314
 315         # save done
 316         self.changed = 0
 317
 318     def purge_entry(self, identifier):
 319         ''' Remove a file from file index and word index
 320         '''
 321         self.load_index()
 322
 323         if not self.files.has_key(identifier):
 324             return
 325
 326         file_index = self.files[identifier][0]
 327         del self.files[identifier]
 328         del self.fileids[file_index]
 329
 330         # The much harder part, cleanup the word index
 331         for key, occurs in self.words.items():
 332             if occurs.has_key(file_index):
 333                 del occurs[file_index]
 334
 335         # save needed
 336         self.changed = 1
 337
 338     def index_loaded(self):
 339         return (hasattr(self,'fileids') and hasattr(self,'files') and
 340             hasattr(self,'words'))
 341
 342 # vim: set filetype=python ts=4 sw=4 et si