roundup/indexer.py

   1 #
   2 # This module is derived from the module described at:
   3 #   http://gnosis.cx/publish/programming/charming_python_15.txt
   4 #
   5 # Author: David Mertz (mertz@gnosis.cx)
   6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
   7 #            Gregory Popovitch (greg@gpy.com)
   8 #
   9 # The original module was released under this license, and remains under
  10 # it:
  11 #
  12 #     This file is released to the public domain.  I (dqm) would
  13 #     appreciate it if you choose to keep derived works under terms
  14 #     that promote freedom, but obviously am giving up any rights
  15 #     to compel such.
  16 #
  17 #$Id: indexer.py,v 1.4 2002-07-09 03:02:52 richard Exp $
  18 '''
  19 This module provides an indexer class, Indexer, that stores text
  20 indices in a roundup instance.  This class makes it possible to search
  21 the content of messages and text files.
  22 '''
  23 import os, shutil, re, mimetypes, marshal, zlib, errno
  24
  25 class Indexer:
  26     ''' Indexes information from roundup's hyperdb to allow efficient
  27         searching.
  28     '''
  29     def __init__(self, db_path):
  30         indexdb_path = os.path.join(db_path, 'indexes')
  31         self.indexdb = os.path.join(indexdb_path, 'index.db')
  32         self.reindex = 0
  33         self.casesensitive = 0
  34         self.quiet = 9
  35
  36         # see if we need to reindex because of a change in code
  37         if (not os.path.exists(indexdb_path) or
  38                 not os.path.exists(os.path.join(indexdb_path, 'version'))):
  39             # TODO: if the version file exists (in the future) we'll want to
  40             # check the value in it - for now the file itself is a flag
  41             if os.path.exists(indexdb_path):
  42                 shutil.rmtree(indexdb_path)
  43             os.makedirs(indexdb_path)
  44             os.chmod(indexdb_path, 0775)
  45             open(os.path.join(indexdb_path, 'version'), 'w').write('1\n')
  46
  47             # we need to reindex
  48             self.reindex = 1
  49         else:
  50             self.reindex = 0
  51
  52     def should_reindex(self):
  53         '''Should we reindex?
  54         '''
  55         return self.reindex
  56
  57     def add_text(self, identifier, text, mime_type='text/plain'):
  58         ''' Add some text associated with the (classname, nodeid, property)
  59             identifier.
  60         '''
  61         # make sure the index is loaded
  62         self.load_index()
  63
  64         # Is file eligible for (re)indexing?
  65         if self.files.has_key(identifier):
  66             # Reindexing enabled, cleanup dicts
  67             if self.reindex:
  68                 self.purge_entry(identifier, self.files, self.words)
  69             else:
  70                 # DO NOT reindex this file
  71                 if self.quiet < 5:
  72                     print "Not reindexing", identifier
  73                 return 0
  74
  75         # split into words
  76         words = self.splitter(text, mime_type)
  77
  78         # Find new file index, and assign it to identifier
  79         # (_TOP uses trick of negative to avoid conflict with file index)
  80         self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
  81         file_index = abs(self.files['_TOP'][0])
  82         self.files[identifier] = (file_index, len(words))
  83         self.fileids[file_index] = identifier
  84
  85         # find the unique words
  86         filedict = {}
  87         for word in words:
  88             if filedict.has_key(word):
  89                 filedict[word] = filedict[word]+1
  90             else:
  91                 filedict[word] = 1
  92
  93         # now add to the totals
  94         for word in filedict.keys():
  95             # each word has a dict of {identifier: count}
  96             if self.words.has_key(word):
  97                 entry = self.words[word]
  98             else:
  99                 # new word
 100                 entry = {}
 101                 self.words[word] = entry
 102
 103             # make a reference to the file for this word
 104             entry[file_index] = filedict[word]
 105
 106     def splitter(self, text, ftype):
 107         ''' Split the contents of a text string into a list of 'words'
 108         '''
 109         if ftype == 'text/plain':
 110             words = self.text_splitter(text, self.casesensitive)
 111         else:
 112             return []
 113         return words
 114
 115     def text_splitter(self, text, casesensitive=0):
 116         """Split text/plain string into a list of words
 117         """
 118         # Let's adjust case if not case-sensitive
 119         if not casesensitive:
 120             text = text.upper()
 121
 122         # Split the raw text, losing anything longer than 25 characters
 123         # since that'll be gibberish (encoded text or somesuch) or shorter
 124         # than 3 characters since those short words appear all over the
 125         # place
 126         return re.findall(r'\b\w{2,25}\b', text)
 127
 128     def search(self, search_terms, klass, ignore={},
 129             dre=re.compile(r'([^\d]+)(\d+)')):
 130         ''' Display search results looking for [search, terms] associated
 131             with the hyperdb Class "klass". Ignore hits on {class: property}.
 132
 133             "dre" is a helper, not an argument.
 134         '''
 135         # do the index lookup
 136         hits = self.find(search_terms)
 137         if not hits:
 138             return {}
 139
 140         # this is specific to "issue" klass ... eugh
 141         designator_propname = {'msg': 'messages', 'file': 'files'}
 142
 143         # build a dictionary of nodes and their associated messages
 144         # and files
 145         nodeids = {}
 146         for classname, nodeid, property in hits.values():
 147             # skip this result if we don't care about this class/property
 148             if ignore.has_key((classname, property)):
 149                 continue
 150
 151             # if it's a property on klass, it's easy
 152             if classname == klass.classname:
 153                 if not nodeids.has_key(nodeid):
 154                     nodeids[nodeid] = {}
 155                 continue
 156
 157             # it's a linked class - find the klass entries that are
 158             # linked to it
 159             linkprop = designator_propname[classname]
 160             for resid in klass.find(**{linkprop: nodeid}):
 161                 resid = str(resid)
 162                 if not nodeids.has_key(id):
 163                     nodeids[resid] = {}
 164
 165                 # update the links for this klass nodeid
 166                 node_dict = nodeids[resid]
 167                 if not node_dict.has_key(linkprop):
 168                     node_dict[linkprop] = [nodeid]
 169                 elif node_dict.has_key(linkprop):
 170                     node_dict[linkprop].append(nodeid)
 171         return nodeids
 172
 173     # we override this to ignore not 2 < word < 25 and also to fix a bug -
 174     # the (fail) case.
 175     def find(self, wordlist):
 176         ''' Locate files that match ALL the words in wordlist
 177         '''
 178         if not hasattr(self, 'words'):
 179             self.load_index()
 180         self.load_index(wordlist=wordlist)
 181         entries = {}
 182         hits = None
 183         for word in wordlist:
 184             if not 2 < len(word) < 25:
 185                 # word outside the bounds of what we index - ignore
 186                 continue
 187             if not self.casesensitive:
 188                 word = word.upper()
 189             entry = self.words.get(word)    # For each word, get index
 190             entries[word] = entry           #   of matching files
 191             if not entry:                   # Nothing for this one word (fail)
 192                 return {}
 193             if hits is None:
 194                 hits = {}
 195                 for k in entry.keys():
 196                     hits[k] = self.fileids[k]
 197             else:
 198                 # Eliminate hits for every non-match
 199                 for fileid in hits.keys():
 200                     if not entry.has_key(fileid):
 201                         del hits[fileid]
 202         if hits is None:
 203             return {}
 204         return hits
 205
 206     segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!"
 207     def load_index(self, reload=0, wordlist=None):
 208         # Unless reload is indicated, do not load twice
 209         if self.index_loaded() and not reload:
 210             return 0
 211
 212         # Ok, now let's actually load it
 213         db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}}
 214
 215         # Identify the relevant word-dictionary segments
 216         if not wordlist:
 217             segments = self.segments
 218         else:
 219             segments = ['-','#']
 220             for word in wordlist:
 221                 segments.append(word[0].upper())
 222
 223         # Load the segments
 224         for segment in segments:
 225             try:
 226                 f = open(self.indexdb + segment, 'rb')
 227             except IOError, error:
 228                 if error.errno != errno.ENOENT:
 229                     raise
 230             else:
 231                 pickle_str = zlib.decompress(f.read())
 232                 f.close()
 233                 dbslice = marshal.loads(pickle_str)
 234                 if dbslice.get('WORDS'):
 235                     # if it has some words, add them
 236                     for word, entry in dbslice['WORDS'].items():
 237                         db['WORDS'][word] = entry
 238                 if dbslice.get('FILES'):
 239                     # if it has some files, add them
 240                     db['FILES'] = dbslice['FILES']
 241                 if dbslice.get('FILEIDS'):
 242                     # if it has fileids, add them
 243                     db['FILEIDS'] = dbslice['FILEIDS']
 244
 245         self.words = db['WORDS']
 246         self.files = db['FILES']
 247         self.fileids = db['FILEIDS']
 248
 249     def save_index(self):
 250         # make sure we're loaded
 251         self.load_index()
 252
 253         # brutal space saver... delete all the small segments
 254         for segment in self.segments:
 255             try:
 256                 os.remove(self.indexdb + segment)
 257             except OSError:
 258                 # probably just nonexistent segment index file
 259                 # TODO: make sure it's an EEXIST
 260                 pass
 261
 262         # First write the much simpler filename/fileid dictionaries
 263         dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
 264         open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))
 265
 266         # The hard part is splitting the word dictionary up, of course
 267         letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#"
 268         segdicts = {}                           # Need batch of empty dicts
 269         for segment in letters:
 270             segdicts[segment] = {}
 271         for word, entry in self.words.items():  # Split into segment dicts
 272             initchar = word[0].upper()
 273             segdicts[initchar][word] = entry
 274
 275         # save
 276         for initchar in letters:
 277             db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None}
 278             pickle_str = marshal.dumps(db)
 279             filename = self.indexdb + initchar
 280             pickle_fh = open(filename, 'wb')
 281             pickle_fh.write(zlib.compress(pickle_str))
 282             os.chmod(filename, 0664)
 283
 284     def purge_entry(self, fname, file_dct, word_dct):
 285         ''' Remove a file from file index and word index
 286         '''
 287         try:        # The easy part, cleanup the file index
 288             file_index = file_dct[fname]
 289             del file_dct[fname]
 290         except KeyError:
 291             pass    # We'll assume we only encounter KeyError's
 292         # The much harder part, cleanup the word index
 293         for word, occurs in word_dct.items():
 294             if occurs.has_key(file_index):
 295                 del occurs[file_index]
 296                 word_dct[word] = occurs
 297
 298     def index_loaded(self):
 299         return (hasattr(self,'fileids') and hasattr(self,'files') and
 300             hasattr(self,'words'))
 301
 302 #
 303 #$Log: not supported by cvs2svn $
 304 #Revision 1.3  2002/07/08 06:58:15  richard
 305 #cleaned up the indexer code:
 306 # - it splits more words out (much simpler, faster splitter)
 307 # - removed code we'll never use (roundup.roundup_indexer has the full
 308 #   implementation, and replaces roundup.indexer)
 309 # - only index text/plain and rfc822/message (ideas for other text formats to
 310 #   index are welcome)
 311 # - added simple unit test for indexer. Needs more tests for regression.
 312 #
 313 #Revision 1.2  2002/05/25 07:16:24  rochecompaan
 314 #Merged search_indexing-branch with HEAD
 315 #
 316 #Revision 1.1.2.3  2002/05/02 11:52:12  rochecompaan
 317 #Fixed small bug that prevented indexes from being generated.
 318 #
 319 #Revision 1.1.2.2  2002/04/19 19:54:42  rochecompaan
 320 #cgi_client.py
 321 #    removed search link for the time being
 322 #    moved rendering of matches to htmltemplate
 323 #hyperdb.py
 324 #    filtering of nodes on full text search incorporated in filter method
 325 #roundupdb.py
 326 #    added paramater to call of filter method
 327 #roundup_indexer.py
 328 #    added search method to RoundupIndexer class
 329 #
 330 #Revision 1.1.2.1  2002/04/03 11:55:57  rochecompaan
 331 # . Added feature #526730 - search for messages capability
 332 #