roundup/indexer.py

   1 #
   2 # This module is derived from the module described at:
   3 #   http://gnosis.cx/publish/programming/charming_python_15.txt
   4 #
   5 # Author: David Mertz (mertz@gnosis.cx)
   6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
   7 #            Gregory Popovitch (greg@gpy.com)
   8 #
   9 # The original module was released under this license, and remains under
  10 # it:
  11 #
  12 #     This file is released to the public domain.  I (dqm) would
  13 #     appreciate it if you choose to keep derived works under terms
  14 #     that promote freedom, but obviously am giving up any rights
  15 #     to compel such.
  16 #
  17 #$Id: indexer.py,v 1.7 2002-07-09 21:38:43 richard Exp $
  18 '''
  19 This module provides an indexer class, Indexer, that stores text
  20 indices in a roundup instance.  This class makes searching the content of
  21 messages and text files possible.
  22 '''
  23 import os, shutil, re, mimetypes, marshal, zlib, errno
  24
  25 class Indexer:
  26     ''' Indexes information from roundup's hyperdb to allow efficient
  27         searching.
  28
  29         Three structures are created by the indexer:
  30           files   {identifier: (fileid, wordcount)}
  31           words   {word: {fileid: count}}
  32           fileids {fileid: identifier}
  33     '''
  34     def __init__(self, db_path):
  35         self.indexdb_path = os.path.join(db_path, 'indexes')
  36         self.indexdb = os.path.join(self.indexdb_path, 'index.db')
  37         self.reindex = 0
  38         self.quiet = 9
  39         self.changed = 0
  40
  41         # see if we need to reindex because of a change in code
  42         if (not os.path.exists(self.indexdb_path) or
  43                 not os.path.exists(os.path.join(self.indexdb_path, 'version'))):
  44             # TODO: if the version file exists (in the future) we'll want to
  45             # check the value in it - for now the file itself is a flag
  46             self.force_reindex()
  47
  48     def force_reindex(self):
  49         '''Force a reindex condition
  50         '''
  51         if os.path.exists(self.indexdb_path):
  52             shutil.rmtree(self.indexdb_path)
  53         os.makedirs(self.indexdb_path)
  54         os.chmod(self.indexdb_path, 0775)
  55         open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
  56         self.reindex = 1
  57         self.changed = 1
  58
  59     def should_reindex(self):
  60         '''Should we reindex?
  61         '''
  62         return self.reindex
  63
  64     def add_text(self, identifier, text, mime_type='text/plain'):
  65         ''' Add some text associated with the (classname, nodeid, property)
  66             identifier.
  67         '''
  68         # make sure the index is loaded
  69         self.load_index()
  70
  71         # remove old entries for this identifier
  72         if self.files.has_key(identifier):
  73             self.purge_entry(identifier)
  74
  75         # split into words
  76         words = self.splitter(text, mime_type)
  77
  78         # Find new file index, and assign it to identifier
  79         # (_TOP uses trick of negative to avoid conflict with file index)
  80         self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
  81         file_index = abs(self.files['_TOP'][0])
  82         self.files[identifier] = (file_index, len(words))
  83         self.fileids[file_index] = identifier
  84
  85         # find the unique words
  86         filedict = {}
  87         for word in words:
  88             if filedict.has_key(word):
  89                 filedict[word] = filedict[word]+1
  90             else:
  91                 filedict[word] = 1
  92
  93         # now add to the totals
  94         for word in filedict.keys():
  95             # each word has a dict of {identifier: count}
  96             if self.words.has_key(word):
  97                 entry = self.words[word]
  98             else:
  99                 # new word
 100                 entry = {}
 101                 self.words[word] = entry
 102
 103             # make a reference to the file for this word
 104             entry[file_index] = filedict[word]
 105
 106         # save needed
 107         self.changed = 1
 108
 109     def splitter(self, text, ftype):
 110         ''' Split the contents of a text string into a list of 'words'
 111         '''
 112         if ftype == 'text/plain':
 113             words = self.text_splitter(text)
 114         else:
 115             return []
 116         return words
 117
 118     def text_splitter(self, text):
 119         """Split text/plain string into a list of words
 120         """
 121         # case insensitive
 122         text = text.upper()
 123
 124         # Split the raw text, losing anything longer than 25 characters
 125         # since that'll be gibberish (encoded text or somesuch) or shorter
 126         # than 3 characters since those short words appear all over the
 127         # place
 128         return re.findall(r'\b\w{2,25}\b', text)
 129
 130     def search(self, search_terms, klass, ignore={},
 131             dre=re.compile(r'([^\d]+)(\d+)')):
 132         ''' Display search results looking for [search, terms] associated
 133             with the hyperdb Class "klass". Ignore hits on {class: property}.
 134
 135             "dre" is a helper, not an argument.
 136         '''
 137         # do the index lookup
 138         hits = self.find(search_terms)
 139         if not hits:
 140             return {}
 141
 142         # this is specific to "issue" klass ... eugh
 143         designator_propname = {'msg': 'messages', 'file': 'files'}
 144
 145         # build a dictionary of nodes and their associated messages
 146         # and files
 147         nodeids = {}
 148         for classname, nodeid, property in hits.values():
 149             # skip this result if we don't care about this class/property
 150             if ignore.has_key((classname, property)):
 151                 continue
 152
 153             # if it's a property on klass, it's easy
 154             if classname == klass.classname:
 155                 if not nodeids.has_key(nodeid):
 156                     nodeids[nodeid] = {}
 157                 continue
 158
 159             # it's a linked class - find the klass entries that are
 160             # linked to it
 161             linkprop = designator_propname[classname]
 162             for resid in klass.find(**{linkprop: nodeid}):
 163                 resid = str(resid)
 164                 if not nodeids.has_key(id):
 165                     nodeids[resid] = {}
 166
 167                 # update the links for this klass nodeid
 168                 node_dict = nodeids[resid]
 169                 if not node_dict.has_key(linkprop):
 170                     node_dict[linkprop] = [nodeid]
 171                 elif node_dict.has_key(linkprop):
 172                     node_dict[linkprop].append(nodeid)
 173         return nodeids
 174
 175     # we override this to ignore not 2 < word < 25 and also to fix a bug -
 176     # the (fail) case.
 177     def find(self, wordlist):
 178         ''' Locate files that match ALL the words in wordlist
 179         '''
 180         if not hasattr(self, 'words'):
 181             self.load_index()
 182         self.load_index(wordlist=wordlist)
 183         entries = {}
 184         hits = None
 185         for word in wordlist:
 186             if not 2 < len(word) < 25:
 187                 # word outside the bounds of what we index - ignore
 188                 continue
 189             word = word.upper()
 190             entry = self.words.get(word)    # For each word, get index
 191             entries[word] = entry           #   of matching files
 192             if not entry:                   # Nothing for this one word (fail)
 193                 return {}
 194             if hits is None:
 195                 hits = {}
 196                 for k in entry.keys():
 197                     hits[k] = self.fileids[k]
 198             else:
 199                 # Eliminate hits for every non-match
 200                 for fileid in hits.keys():
 201                     if not entry.has_key(fileid):
 202                         del hits[fileid]
 203         if hits is None:
 204             return {}
 205         return hits
 206
 207     segments = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_-!"
 208     def load_index(self, reload=0, wordlist=None):
 209         # Unless reload is indicated, do not load twice
 210         if self.index_loaded() and not reload:
 211             return 0
 212
 213         # Ok, now let's actually load it
 214         db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}}
 215
 216         # Identify the relevant word-dictionary segments
 217         if not wordlist:
 218             segments = self.segments
 219         else:
 220             segments = ['-','#']
 221             for word in wordlist:
 222                 segments.append(word[0].upper())
 223
 224         # Load the segments
 225         for segment in segments:
 226             try:
 227                 f = open(self.indexdb + segment, 'rb')
 228             except IOError, error:
 229                 if error.errno != errno.ENOENT:
 230                     raise
 231             else:
 232                 pickle_str = zlib.decompress(f.read())
 233                 f.close()
 234                 dbslice = marshal.loads(pickle_str)
 235                 if dbslice.get('WORDS'):
 236                     # if it has some words, add them
 237                     for word, entry in dbslice['WORDS'].items():
 238                         db['WORDS'][word] = entry
 239                 if dbslice.get('FILES'):
 240                     # if it has some files, add them
 241                     db['FILES'] = dbslice['FILES']
 242                 if dbslice.get('FILEIDS'):
 243                     # if it has fileids, add them
 244                     db['FILEIDS'] = dbslice['FILEIDS']
 245
 246         self.words = db['WORDS']
 247         self.files = db['FILES']
 248         self.fileids = db['FILEIDS']
 249         self.changed = 0
 250
 251     def save_index(self):
 252         # only save if the index is loaded and changed
 253         if not self.index_loaded() or not self.changed:
 254             return
 255
 256         # brutal space saver... delete all the small segments
 257         for segment in self.segments:
 258             try:
 259                 os.remove(self.indexdb + segment)
 260             except OSError:
 261                 # probably just nonexistent segment index file
 262                 # TODO: make sure it's an EEXIST
 263                 pass
 264
 265         # First write the much simpler filename/fileid dictionaries
 266         dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
 267         open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))
 268
 269         # The hard part is splitting the word dictionary up, of course
 270         letters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ#_"
 271         segdicts = {}                           # Need batch of empty dicts
 272         for segment in letters:
 273             segdicts[segment] = {}
 274         for word, entry in self.words.items():  # Split into segment dicts
 275             initchar = word[0].upper()
 276             segdicts[initchar][word] = entry
 277
 278         # save
 279         for initchar in letters:
 280             db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None}
 281             pickle_str = marshal.dumps(db)
 282             filename = self.indexdb + initchar
 283             pickle_fh = open(filename, 'wb')
 284             pickle_fh.write(zlib.compress(pickle_str))
 285             os.chmod(filename, 0664)
 286
 287         # save done
 288         self.changed = 0
 289
 290     def purge_entry(self, identifier):
 291         ''' Remove a file from file index and word index
 292         '''
 293         if not self.files.has_key(identifier):
 294             return
 295
 296         file_index = self.files[identifier][0]
 297         del self.files[identifier]
 298         del self.fileids[file_index]
 299
 300         # The much harder part, cleanup the word index
 301         for key, occurs in self.words.items():
 302             if occurs.has_key(file_index):
 303                 del occurs[file_index]
 304
 305         # save needed
 306         self.changed = 1
 307
 308     def index_loaded(self):
 309         return (hasattr(self,'fileids') and hasattr(self,'files') and
 310             hasattr(self,'words'))
 311
 312 #
 313 #$Log: not supported by cvs2svn $
 314 #Revision 1.6  2002/07/09 04:26:44  richard
 315 #We're indexing numbers now, and _underscore words
 316 #
 317 #Revision 1.5  2002/07/09 04:19:09  richard
 318 #Added reindex command to roundup-admin.
 319 #Fixed reindex on first access.
 320 #Also fixed reindexing of entries that change.
 321 #
 322 #Revision 1.4  2002/07/09 03:02:52  richard
 323 #More indexer work:
 324 #- all String properties may now be indexed too. Currently there's a bit of
 325 #  "issue" specific code in the actual searching which needs to be
 326 #  addressed. In a nutshell:
 327 #  + pass 'indexme="yes"' as a String() property initialisation arg, eg:
 328 #        file = FileClass(db, "file", name=String(), type=String(),
 329 #            comment=String(indexme="yes"))
 330 #  + the comment will then be indexed and be searchable, with the results
 331 #    related back to the issue that the file is linked to
 332 #- as a result of this work, the FileClass has a default MIME type that may
 333 #  be overridden in a subclass, or by the use of a "type" property as is
 334 #  done in the default templates.
 335 #- the regeneration of the indexes (if necessary) is done once the schema is
 336 #  set up in the dbinit.
 337 #
 338 #Revision 1.3  2002/07/08 06:58:15  richard
 339 #cleaned up the indexer code:
 340 # - it splits more words out (much simpler, faster splitter)
 341 # - removed code we'll never use (roundup.roundup_indexer has the full
 342 #   implementation, and replaces roundup.indexer)
 343 # - only index text/plain and rfc822/message (ideas for other text formats to
 344 #   index are welcome)
 345 # - added simple unit test for indexer. Needs more tests for regression.
 346 #
 347 #Revision 1.2  2002/05/25 07:16:24  rochecompaan
 348 #Merged search_indexing-branch with HEAD
 349 #
 350 #Revision 1.1.2.3  2002/05/02 11:52:12  rochecompaan
 351 #Fixed small bug that prevented indexes from being generated.
 352 #
 353 #Revision 1.1.2.2  2002/04/19 19:54:42  rochecompaan
 354 #cgi_client.py
 355 #    removed search link for the time being
 356 #    moved rendering of matches to htmltemplate
 357 #hyperdb.py
 358 #    filtering of nodes on full text search incorporated in filter method
 359 #roundupdb.py
 360 #    added parameter to call of filter method
 361 #roundup_indexer.py
 362 #    added search method to RoundupIndexer class
 363 #
 364 #Revision 1.1.2.1  2002/04/03 11:55:57  rochecompaan
 365 # . Added feature #526730 - search for messages capability
 366 #