roundup/indexer.py

   1 #
   2 # This module is derived from the module described at:
   3 #   http://gnosis.cx/publish/programming/charming_python_15.txt
   4 #
   5 # Author: David Mertz (mertz@gnosis.cx)
   6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
   7 #            Gregory Popovitch (greg@gpy.com)
   8 #
   9 # The original module was released under this license, and remains under
  10 # it:
  11 #
  12 #     This file is released to the public domain.  I (dqm) would
  13 #     appreciate it if you choose to keep derived works under terms
  14 #     that promote freedom, but obviously am giving up any rights
  15 #     to compel such.
  16 #
  17 #$Id: indexer.py,v 1.18 2004-02-11 23:55:08 richard Exp $
   18 '''This module provides an indexer class, Indexer, that stores text
   19 indices in a roundup instance.  This class makes searching the content of
   20 messages, string properties and text files possible.
   21 '''
  22 __docformat__ = 'restructuredtext'
  23
  24 import os, shutil, re, mimetypes, marshal, zlib, errno
  25 from hyperdb import Link, Multilink
  26
  27 class Indexer:
  28     '''Indexes information from roundup's hyperdb to allow efficient
  29     searching.
  30
  31     Three structures are created by the indexer::
  32
  33           files   {identifier: (fileid, wordcount)}
  34           words   {word: {fileid: count}}
  35           fileids {fileid: identifier}
  36
  37     where identifier is (classname, nodeid, propertyname)
  38     '''
  39     def __init__(self, db_path):
  40         self.indexdb_path = os.path.join(db_path, 'indexes')
  41         self.indexdb = os.path.join(self.indexdb_path, 'index.db')
  42         self.reindex = 0
  43         self.quiet = 9
  44         self.changed = 0
  45
  46         # see if we need to reindex because of a change in code
  47         version = os.path.join(self.indexdb_path, 'version')
  48         if (not os.path.exists(self.indexdb_path) or
  49                 not os.path.exists(version)):
  50             # for now the file itself is a flag
  51             self.force_reindex()
  52         elif os.path.exists(version):
  53             version = open(version).read()
  54             # check the value and reindex if it's not the latest
  55             if version.strip() != '1':
  56                 self.force_reindex()
  57
  58     def force_reindex(self):
  59         '''Force a reindex condition
  60         '''
  61         if os.path.exists(self.indexdb_path):
  62             shutil.rmtree(self.indexdb_path)
  63         os.makedirs(self.indexdb_path)
  64         os.chmod(self.indexdb_path, 0775)
  65         open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
  66         self.reindex = 1
  67         self.changed = 1
  68
  69     def should_reindex(self):
  70         '''Should we reindex?
  71         '''
  72         return self.reindex
  73
  74     def add_text(self, identifier, text, mime_type='text/plain'):
  75         '''Add some text associated with the (classname, nodeid, property)
  76         identifier.
  77         '''
  78         # make sure the index is loaded
  79         self.load_index()
  80
  81         # remove old entries for this identifier
  82         if self.files.has_key(identifier):
  83             self.purge_entry(identifier)
  84
  85         # split into words
  86         words = self.splitter(text, mime_type)
  87
  88         # Find new file index, and assign it to identifier
  89         # (_TOP uses trick of negative to avoid conflict with file index)
  90         self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
  91         file_index = abs(self.files['_TOP'][0])
  92         self.files[identifier] = (file_index, len(words))
  93         self.fileids[file_index] = identifier
  94
  95         # find the unique words
  96         filedict = {}
  97         for word in words:
  98             if filedict.has_key(word):
  99                 filedict[word] = filedict[word]+1
 100             else:
 101                 filedict[word] = 1
 102
 103         # now add to the totals
 104         for word in filedict.keys():
 105             # each word has a dict of {identifier: count}
 106             if self.words.has_key(word):
 107                 entry = self.words[word]
 108             else:
 109                 # new word
 110                 entry = {}
 111                 self.words[word] = entry
 112
 113             # make a reference to the file for this word
 114             entry[file_index] = filedict[word]
 115
 116         # save needed
 117         self.changed = 1
 118
 119     def splitter(self, text, ftype):
 120         '''Split the contents of a text string into a list of 'words'
 121         '''
 122         if ftype == 'text/plain':
 123             words = self.text_splitter(text)
 124         else:
 125             return []
 126         return words
 127
 128     def text_splitter(self, text):
 129         """Split text/plain string into a list of words
 130         """
 131         # case insensitive
 132         text = text.upper()
 133
 134         # Split the raw text, losing anything longer than 25 characters
 135         # since that'll be gibberish (encoded text or somesuch) or shorter
 136         # than 3 characters since those short words appear all over the
 137         # place
 138         return re.findall(r'\b\w{2,25}\b', text)
 139
 140     def search(self, search_terms, klass, ignore={},
 141             dre=re.compile(r'([^\d]+)(\d+)')):
 142         '''Display search results looking for [search, terms] associated
 143         with the hyperdb Class "klass". Ignore hits on {class: property}.
 144
 145         "dre" is a helper, not an argument.
 146         '''
 147         # do the index lookup
 148         hits = self.find(search_terms)
 149         if not hits:
 150             return {}
 151
 152         designator_propname = {}
 153         for nm, propclass in klass.getprops().items():
 154             if isinstance(propclass, Link) or isinstance(propclass, Multilink):
 155                 designator_propname[propclass.classname] = nm
 156
 157         # build a dictionary of nodes and their associated messages
 158         # and files
 159         nodeids = {}      # this is the answer
 160         propspec = {}     # used to do the klass.find
 161         for propname in designator_propname.values():
 162             propspec[propname] = {}   # used as a set (value doesn't matter)
 163         for classname, nodeid, property in hits.values():
 164             # skip this result if we don't care about this class/property
 165             if ignore.has_key((classname, property)):
 166                 continue
 167
 168             # if it's a property on klass, it's easy
 169             if classname == klass.classname:
 170                 if not nodeids.has_key(nodeid):
 171                     nodeids[nodeid] = {}
 172                 continue
 173
 174             # make sure the class is a linked one, otherwise ignore
 175             if not designator_propname.has_key(classname):
 176                 continue
 177
 178             # it's a linked class - set up to do the klass.find
 179             linkprop = designator_propname[classname]   # eg, msg -> messages
 180             propspec[linkprop][nodeid] = 1
 181
 182         # retain only the meaningful entries
 183         for propname, idset in propspec.items():
 184             if not idset:
 185                 del propspec[propname]
 186
 187         # klass.find tells me the klass nodeids the linked nodes relate to
 188         for resid in klass.find(**propspec):
 189             resid = str(resid)
 190             if not nodeids.has_key(id):
 191                 nodeids[resid] = {}
 192             node_dict = nodeids[resid]
 193             # now figure out where it came from
 194             for linkprop in propspec.keys():
 195                 for nodeid in klass.get(resid, linkprop):
 196                     if propspec[linkprop].has_key(nodeid):
 197                         # OK, this node[propname] has a winner
 198                         if not node_dict.has_key(linkprop):
 199                             node_dict[linkprop] = [nodeid]
 200                         else:
 201                             node_dict[linkprop].append(nodeid)
 202         return nodeids
 203
 204     # we override this to ignore not 2 < word < 25 and also to fix a bug -
 205     # the (fail) case.
 206     def find(self, wordlist):
 207         '''Locate files that match ALL the words in wordlist
 208         '''
 209         if not hasattr(self, 'words'):
 210             self.load_index()
 211         self.load_index(wordlist=wordlist)
 212         entries = {}
 213         hits = None
 214         for word in wordlist:
 215             if not 2 < len(word) < 25:
 216                 # word outside the bounds of what we index - ignore
 217                 continue
 218             word = word.upper()
 219             entry = self.words.get(word)    # For each word, get index
 220             entries[word] = entry           #   of matching files
 221             if not entry:                   # Nothing for this one word (fail)
 222                 return {}
 223             if hits is None:
 224                 hits = {}
 225                 for k in entry.keys():
 226                     if not self.fileids.has_key(k):
 227                         raise ValueError, 'Index is corrupted: re-generate it'
 228                     hits[k] = self.fileids[k]
 229             else:
 230                 # Eliminate hits for every non-match
 231                 for fileid in hits.keys():
 232                     if not entry.has_key(fileid):
 233                         del hits[fileid]
 234         if hits is None:
 235             return {}
 236         return hits
 237
 238     segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
 239     def load_index(self, reload=0, wordlist=None):
 240         # Unless reload is indicated, do not load twice
 241         if self.index_loaded() and not reload:
 242             return 0
 243
 244         # Ok, now let's actually load it
 245         db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}}
 246
 247         # Identify the relevant word-dictionary segments
 248         if not wordlist:
 249             segments = self.segments
 250         else:
 251             segments = ['-','#']
 252             for word in wordlist:
 253                 segments.append(word[0].upper())
 254
 255         # Load the segments
 256         for segment in segments:
 257             try:
 258                 f = open(self.indexdb + segment, 'rb')
 259             except IOError, error:
 260                 # probably just nonexistent segment index file
 261                 if error.errno != errno.ENOENT: raise
 262             else:
 263                 pickle_str = zlib.decompress(f.read())
 264                 f.close()
 265                 dbslice = marshal.loads(pickle_str)
 266                 if dbslice.get('WORDS'):
 267                     # if it has some words, add them
 268                     for word, entry in dbslice['WORDS'].items():
 269                         db['WORDS'][word] = entry
 270                 if dbslice.get('FILES'):
 271                     # if it has some files, add them
 272                     db['FILES'] = dbslice['FILES']
 273                 if dbslice.get('FILEIDS'):
 274                     # if it has fileids, add them
 275                     db['FILEIDS'] = dbslice['FILEIDS']
 276
 277         self.words = db['WORDS']
 278         self.files = db['FILES']
 279         self.fileids = db['FILEIDS']
 280         self.changed = 0
 281
 282     def save_index(self):
 283         # only save if the index is loaded and changed
 284         if not self.index_loaded() or not self.changed:
 285             return
 286
 287         # brutal space saver... delete all the small segments
 288         for segment in self.segments:
 289             try:
 290                 os.remove(self.indexdb + segment)
 291             except OSError, error:
 292                 # probably just nonexistent segment index file
 293                 if error.errno != errno.ENOENT: raise
 294
 295         # First write the much simpler filename/fileid dictionaries
 296         dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
 297         open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))
 298
 299         # The hard part is splitting the word dictionary up, of course
 300         letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
 301         segdicts = {}                           # Need batch of empty dicts
 302         for segment in letters:
 303             segdicts[segment] = {}
 304         for word, entry in self.words.items():  # Split into segment dicts
 305             initchar = word[0].upper()
 306             segdicts[initchar][word] = entry
 307
 308         # save
 309         for initchar in letters:
 310             db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None}
 311             pickle_str = marshal.dumps(db)
 312             filename = self.indexdb + initchar
 313             pickle_fh = open(filename, 'wb')
 314             pickle_fh.write(zlib.compress(pickle_str))
 315             os.chmod(filename, 0664)
 316
 317         # save done
 318         self.changed = 0
 319
 320     def purge_entry(self, identifier):
 321         '''Remove a file from file index and word index
 322         '''
 323         self.load_index()
 324
 325         if not self.files.has_key(identifier):
 326             return
 327
 328         file_index = self.files[identifier][0]
 329         del self.files[identifier]
 330         del self.fileids[file_index]
 331
 332         # The much harder part, cleanup the word index
 333         for key, occurs in self.words.items():
 334             if occurs.has_key(file_index):
 335                 del occurs[file_index]
 336
 337         # save needed
 338         self.changed = 1
 339
 340     def index_loaded(self):
 341         return (hasattr(self,'fileids') and hasattr(self,'files') and
 342             hasattr(self,'words'))
 343
 344 # vim: set filetype=python ts=4 sw=4 et si