roundup/backends/indexer_dbm.py

   1 #
   2 # This module is derived from the module described at:
   3 #   http://gnosis.cx/publish/programming/charming_python_15.txt
   4 #
   5 # Author: David Mertz (mertz@gnosis.cx)
   6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
   7 #            Gregory Popovitch (greg@gpy.com)
   8 #
   9 # The original module was released under this license, and remains under
  10 # it:
  11 #
  12 #     This file is released to the public domain.  I (dqm) would
  13 #     appreciate it if you choose to keep derived works under terms
  14 #     that promote freedom, but obviously am giving up any rights
  15 #     to compel such.
  16 #
  17 #$Id: indexer_dbm.py,v 1.9 2006-04-27 05:48:26 richard Exp $
'''This module provides an indexer class, Indexer, that stores text
indices in a roundup instance.  This class makes searching the content of
messages, string properties and text files possible.
'''
  22 __docformat__ = 'restructuredtext'
  23
  24 import os, shutil, re, mimetypes, marshal, zlib, errno
  25 from roundup.hyperdb import Link, Multilink
  26 from roundup.backends.indexer_common import Indexer as IndexerBase
  27
  28 class Indexer(IndexerBase):
  29     '''Indexes information from roundup's hyperdb to allow efficient
  30     searching.
  31
  32     Three structures are created by the indexer::
  33
  34           files   {identifier: (fileid, wordcount)}
  35           words   {word: {fileid: count}}
  36           fileids {fileid: identifier}
  37
  38     where identifier is (classname, nodeid, propertyname)
  39     '''
  40     def __init__(self, db):
  41         IndexerBase.__init__(self, db)
  42         self.indexdb_path = os.path.join(db.config.DATABASE, 'indexes')
  43         self.indexdb = os.path.join(self.indexdb_path, 'index.db')
  44         self.reindex = 0
  45         self.quiet = 9
  46         self.changed = 0
  47
  48         # see if we need to reindex because of a change in code
  49         version = os.path.join(self.indexdb_path, 'version')
  50         if (not os.path.exists(self.indexdb_path) or
  51                 not os.path.exists(version)):
  52             # for now the file itself is a flag
  53             self.force_reindex()
  54         elif os.path.exists(version):
  55             version = open(version).read()
  56             # check the value and reindex if it's not the latest
  57             if version.strip() != '1':
  58                 self.force_reindex()
  59
  60     def force_reindex(self):
  61         '''Force a reindex condition
  62         '''
  63         if os.path.exists(self.indexdb_path):
  64             shutil.rmtree(self.indexdb_path)
  65         os.makedirs(self.indexdb_path)
  66         os.chmod(self.indexdb_path, 0775)
  67         open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
  68         self.reindex = 1
  69         self.changed = 1
  70
  71     def should_reindex(self):
  72         '''Should we reindex?
  73         '''
  74         return self.reindex
  75
  76     def add_text(self, identifier, text, mime_type='text/plain'):
  77         '''Add some text associated with the (classname, nodeid, property)
  78         identifier.
  79         '''
  80         # make sure the index is loaded
  81         self.load_index()
  82
  83         # remove old entries for this identifier
  84         if self.files.has_key(identifier):
  85             self.purge_entry(identifier)
  86
  87         # split into words
  88         words = self.splitter(text, mime_type)
  89
  90         # Find new file index, and assign it to identifier
  91         # (_TOP uses trick of negative to avoid conflict with file index)
  92         self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
  93         file_index = abs(self.files['_TOP'][0])
  94         self.files[identifier] = (file_index, len(words))
  95         self.fileids[file_index] = identifier
  96
  97         # find the unique words
  98         filedict = {}
  99         for word in words:
 100             if self.is_stopword(word):
 101                 continue
 102             if filedict.has_key(word):
 103                 filedict[word] = filedict[word]+1
 104             else:
 105                 filedict[word] = 1
 106
 107         # now add to the totals
 108         for word in filedict.keys():
 109             # each word has a dict of {identifier: count}
 110             if self.words.has_key(word):
 111                 entry = self.words[word]
 112             else:
 113                 # new word
 114                 entry = {}
 115                 self.words[word] = entry
 116
 117             # make a reference to the file for this word
 118             entry[file_index] = filedict[word]
 119
 120         # save needed
 121         self.changed = 1
 122
 123     def splitter(self, text, ftype):
 124         '''Split the contents of a text string into a list of 'words'
 125         '''
 126         if ftype == 'text/plain':
 127             words = self.text_splitter(text)
 128         else:
 129             return []
 130         return words
 131
 132     def text_splitter(self, text):
 133         """Split text/plain string into a list of words
 134         """
 135         # case insensitive
 136         text = str(text).upper()
 137
 138         # Split the raw text
 139         return re.findall(r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength),
 140                           text)
 141
 142     # we override this to ignore too short and too long words
 143     # and also to fix a bug - the (fail) case.
 144     def find(self, wordlist):
 145         '''Locate files that match ALL the words in wordlist
 146         '''
 147         if not hasattr(self, 'words'):
 148             self.load_index()
 149         self.load_index(wordlist=wordlist)
 150         entries = {}
 151         hits = None
 152         for word in wordlist:
 153             if not self.minlength <= len(word) <= self.maxlength:
 154                 # word outside the bounds of what we index - ignore
 155                 continue
 156             word = word.upper()
 157             if self.is_stopword(word):
 158                 continue
 159             entry = self.words.get(word)    # For each word, get index
 160             entries[word] = entry           #   of matching files
 161             if not entry:                   # Nothing for this one word (fail)
 162                 return {}
 163             if hits is None:
 164                 hits = {}
 165                 for k in entry.keys():
 166                     if not self.fileids.has_key(k):
 167                         raise ValueError, 'Index is corrupted: re-generate it'
 168                     hits[k] = self.fileids[k]
 169             else:
 170                 # Eliminate hits for every non-match
 171                 for fileid in hits.keys():
 172                     if not entry.has_key(fileid):
 173                         del hits[fileid]
 174         if hits is None:
 175             return {}
 176         return hits.values()
 177
 178     segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
 179     def load_index(self, reload=0, wordlist=None):
 180         # Unless reload is indicated, do not load twice
 181         if self.index_loaded() and not reload:
 182             return 0
 183
 184         # Ok, now let's actually load it
 185         db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}}
 186
 187         # Identify the relevant word-dictionary segments
 188         if not wordlist:
 189             segments = self.segments
 190         else:
 191             segments = ['-','#']
 192             for word in wordlist:
 193                 segments.append(word[0].upper())
 194
 195         # Load the segments
 196         for segment in segments:
 197             try:
 198                 f = open(self.indexdb + segment, 'rb')
 199             except IOError, error:
 200                 # probably just nonexistent segment index file
 201                 if error.errno != errno.ENOENT: raise
 202             else:
 203                 pickle_str = zlib.decompress(f.read())
 204                 f.close()
 205                 dbslice = marshal.loads(pickle_str)
 206                 if dbslice.get('WORDS'):
 207                     # if it has some words, add them
 208                     for word, entry in dbslice['WORDS'].items():
 209                         db['WORDS'][word] = entry
 210                 if dbslice.get('FILES'):
 211                     # if it has some files, add them
 212                     db['FILES'] = dbslice['FILES']
 213                 if dbslice.get('FILEIDS'):
 214                     # if it has fileids, add them
 215                     db['FILEIDS'] = dbslice['FILEIDS']
 216
 217         self.words = db['WORDS']
 218         self.files = db['FILES']
 219         self.fileids = db['FILEIDS']
 220         self.changed = 0
 221
 222     def save_index(self):
 223         # only save if the index is loaded and changed
 224         if not self.index_loaded() or not self.changed:
 225             return
 226
 227         # brutal space saver... delete all the small segments
 228         for segment in self.segments:
 229             try:
 230                 os.remove(self.indexdb + segment)
 231             except OSError, error:
 232                 # probably just nonexistent segment index file
 233                 if error.errno != errno.ENOENT: raise
 234
 235         # First write the much simpler filename/fileid dictionaries
 236         dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
 237         open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))
 238
 239         # The hard part is splitting the word dictionary up, of course
 240         letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
 241         segdicts = {}                           # Need batch of empty dicts
 242         for segment in letters:
 243             segdicts[segment] = {}
 244         for word, entry in self.words.items():  # Split into segment dicts
 245             initchar = word[0].upper()
 246             segdicts[initchar][word] = entry
 247
 248         # save
 249         for initchar in letters:
 250             db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None}
 251             pickle_str = marshal.dumps(db)
 252             filename = self.indexdb + initchar
 253             pickle_fh = open(filename, 'wb')
 254             pickle_fh.write(zlib.compress(pickle_str))
 255             os.chmod(filename, 0664)
 256
 257         # save done
 258         self.changed = 0
 259
 260     def purge_entry(self, identifier):
 261         '''Remove a file from file index and word index
 262         '''
 263         self.load_index()
 264
 265         if not self.files.has_key(identifier):
 266             return
 267
 268         file_index = self.files[identifier][0]
 269         del self.files[identifier]
 270         del self.fileids[file_index]
 271
 272         # The much harder part, cleanup the word index
 273         for key, occurs in self.words.items():
 274             if occurs.has_key(file_index):
 275                 del occurs[file_index]
 276
 277         # save needed
 278         self.changed = 1
 279
 280     def index_loaded(self):
 281         return (hasattr(self,'fileids') and hasattr(self,'files') and
 282             hasattr(self,'words'))
 283
 284     def rollback(self):
 285         ''' load last saved index info. '''
 286         self.load_index(reload=1)
 287
 288     def close(self):
 289         pass
 290
 291
 292 # vim: set filetype=python ts=4 sw=4 et si