roundup/indexer.py

   1 #
   2 # This module is derived from the module described at:
   3 #   http://gnosis.cx/publish/programming/charming_python_15.txt
   4 #
   5 # Author: David Mertz (mertz@gnosis.cx)
   6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
   7 #            Gregory Popovitch (greg@gpy.com)
   8 #
   9 # The original module was released under this license, and remains under
  10 # it:
  11 #
  12 #     This file is released to the public domain.  I (dqm) would
  13 #     appreciate it if you choose to keep derived works under terms
  14 #     that promote freedom, but obviously am giving up any rights
  15 #     to compel such.
  16 #
  17 #$Id: indexer.py,v 1.5 2002-07-09 04:19:09 richard Exp $
  18 '''
  19 This module provides an indexer class, Indexer, that stores text
  20 indices in a roundup instance.  This class makes searching the content of
  21 messages and text files possible.
  22 '''
  23 import os, shutil, re, mimetypes, marshal, zlib, errno
  24
  25 class Indexer:
  26     ''' Indexes information from roundup's hyperdb to allow efficient
  27         searching.
  28
  29         Three structures are created by the indexer:
  30           files   {identifier: (fileid, wordcount)}
  31           words   {word: {fileid: count}}
  32           fileids {fileid: identifier}
  33     '''
  34     def __init__(self, db_path):
  35         self.indexdb_path = os.path.join(db_path, 'indexes')
  36         self.indexdb = os.path.join(self.indexdb_path, 'index.db')
  37         self.reindex = 0
  38         self.casesensitive = 0
  39         self.quiet = 9
  40
  41         # see if we need to reindex because of a change in code
  42         if (not os.path.exists(self.indexdb_path) or
  43                 not os.path.exists(os.path.join(self.indexdb_path, 'version'))):
  44             # TODO: if the version file exists (in the future) we'll want to
  45             # check the value in it - for now the file itself is a flag
  46             self.force_reindex()
  47
  48     def force_reindex(self):
  49         '''Force a reindex condition
  50         '''
  51         if os.path.exists(self.indexdb_path):
  52             shutil.rmtree(self.indexdb_path)
  53         os.makedirs(self.indexdb_path)
  54         os.chmod(self.indexdb_path, 0775)
  55         open(os.path.join(self.indexdb_path, 'version'), 'w').write('1\n')
  56         self.reindex = 1
  57
  58     def should_reindex(self):
  59         '''Should we reindex?
  60         '''
  61         return self.reindex
  62
  63     def add_text(self, identifier, text, mime_type='text/plain'):
  64         ''' Add some text associated with the (classname, nodeid, property)
  65             identifier.
  66         '''
  67         # make sure the index is loaded
  68         self.load_index()
  69
  70         # remove old entries for this identifier
  71         if self.files.has_key(identifier):
  72             self.purge_entry(identifier)
  73
  74         # split into words
  75         words = self.splitter(text, mime_type)
  76
  77         # Find new file index, and assign it to identifier
  78         # (_TOP uses trick of negative to avoid conflict with file index)
  79         self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
  80         file_index = abs(self.files['_TOP'][0])
  81         self.files[identifier] = (file_index, len(words))
  82         self.fileids[file_index] = identifier
  83
  84         # find the unique words
  85         filedict = {}
  86         for word in words:
  87             if filedict.has_key(word):
  88                 filedict[word] = filedict[word]+1
  89             else:
  90                 filedict[word] = 1
  91
  92         # now add to the totals
  93         for word in filedict.keys():
  94             # each word has a dict of {identifier: count}
  95             if self.words.has_key(word):
  96                 entry = self.words[word]
  97             else:
  98                 # new word
  99                 entry = {}
 100                 self.words[word] = entry
 101
 102             # make a reference to the file for this word
 103             entry[file_index] = filedict[word]
 104
 105     def splitter(self, text, ftype):
 106         ''' Split the contents of a text string into a list of 'words'
 107         '''
 108         if ftype == 'text/plain':
 109             words = self.text_splitter(text, self.casesensitive)
 110         else:
 111             return []
 112         return words
 113
 114     def text_splitter(self, text, casesensitive=0):
 115         """Split text/plain string into a list of words
 116         """
 117         # Let's adjust case if not case-sensitive
 118         if not casesensitive:
 119             text = text.upper()
 120
 121         # Split the raw text, losing anything longer than 25 characters
 122         # since that'll be gibberish (encoded text or somesuch) or shorter
 123         # than 3 characters since those short words appear all over the
 124         # place
 125         return re.findall(r'\b\w{2,25}\b', text)
 126
 127     def search(self, search_terms, klass, ignore={},
 128             dre=re.compile(r'([^\d]+)(\d+)')):
 129         ''' Display search results looking for [search, terms] associated
 130             with the hyperdb Class "klass". Ignore hits on {class: property}.
 131
 132             "dre" is a helper, not an argument.
 133         '''
 134         # do the index lookup
 135         hits = self.find(search_terms)
 136         if not hits:
 137             return {}
 138
 139         # this is specific to "issue" klass ... eugh
 140         designator_propname = {'msg': 'messages', 'file': 'files'}
 141
 142         # build a dictionary of nodes and their associated messages
 143         # and files
 144         nodeids = {}
 145         for classname, nodeid, property in hits.values():
 146             # skip this result if we don't care about this class/property
 147             if ignore.has_key((classname, property)):
 148                 continue
 149
 150             # if it's a property on klass, it's easy
 151             if classname == klass.classname:
 152                 if not nodeids.has_key(nodeid):
 153                     nodeids[nodeid] = {}
 154                 continue
 155
 156             # it's a linked class - find the klass entries that are
 157             # linked to it
 158             linkprop = designator_propname[classname]
 159             for resid in klass.find(**{linkprop: nodeid}):
 160                 resid = str(resid)
 161                 if not nodeids.has_key(id):
 162                     nodeids[resid] = {}
 163
 164                 # update the links for this klass nodeid
 165                 node_dict = nodeids[resid]
 166                 if not node_dict.has_key(linkprop):
 167                     node_dict[linkprop] = [nodeid]
 168                 elif node_dict.has_key(linkprop):
 169                     node_dict[linkprop].append(nodeid)
 170         return nodeids
 171
 172     # we override this to ignore not 2 < word < 25 and also to fix a bug -
 173     # the (fail) case.
 174     def find(self, wordlist):
 175         ''' Locate files that match ALL the words in wordlist
 176         '''
 177         if not hasattr(self, 'words'):
 178             self.load_index()
 179         self.load_index(wordlist=wordlist)
 180         entries = {}
 181         hits = None
 182         for word in wordlist:
 183             if not 2 < len(word) < 25:
 184                 # word outside the bounds of what we index - ignore
 185                 continue
 186             if not self.casesensitive:
 187                 word = word.upper()
 188             entry = self.words.get(word)    # For each word, get index
 189             entries[word] = entry           #   of matching files
 190             if not entry:                   # Nothing for this one word (fail)
 191                 return {}
 192             if hits is None:
 193                 hits = {}
 194                 for k in entry.keys():
 195                     hits[k] = self.fileids[k]
 196             else:
 197                 # Eliminate hits for every non-match
 198                 for fileid in hits.keys():
 199                     if not entry.has_key(fileid):
 200                         del hits[fileid]
 201         if hits is None:
 202             return {}
 203         return hits
 204
 205     segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!"
 206     def load_index(self, reload=0, wordlist=None):
 207         # Unless reload is indicated, do not load twice
 208         if self.index_loaded() and not reload:
 209             return 0
 210
 211         # Ok, now let's actually load it
 212         db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}}
 213
 214         # Identify the relevant word-dictionary segments
 215         if not wordlist:
 216             segments = self.segments
 217         else:
 218             segments = ['-','#']
 219             for word in wordlist:
 220                 segments.append(word[0].upper())
 221
 222         # Load the segments
 223         for segment in segments:
 224             try:
 225                 f = open(self.indexdb + segment, 'rb')
 226             except IOError, error:
 227                 if error.errno != errno.ENOENT:
 228                     raise
 229             else:
 230                 pickle_str = zlib.decompress(f.read())
 231                 f.close()
 232                 dbslice = marshal.loads(pickle_str)
 233                 if dbslice.get('WORDS'):
 234                     # if it has some words, add them
 235                     for word, entry in dbslice['WORDS'].items():
 236                         db['WORDS'][word] = entry
 237                 if dbslice.get('FILES'):
 238                     # if it has some files, add them
 239                     db['FILES'] = dbslice['FILES']
 240                 if dbslice.get('FILEIDS'):
 241                     # if it has fileids, add them
 242                     db['FILEIDS'] = dbslice['FILEIDS']
 243
 244         self.words = db['WORDS']
 245         self.files = db['FILES']
 246         self.fileids = db['FILEIDS']
 247
 248     def save_index(self):
 249         # make sure we're loaded
 250         self.load_index()
 251
 252         # brutal space saver... delete all the small segments
 253         for segment in self.segments:
 254             try:
 255                 os.remove(self.indexdb + segment)
 256             except OSError:
 257                 # probably just nonexistent segment index file
 258                 # TODO: make sure it's an EEXIST
 259                 pass
 260
 261         # First write the much simpler filename/fileid dictionaries
 262         dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
 263         open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))
 264
 265         # The hard part is splitting the word dictionary up, of course
 266         letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#"
 267         segdicts = {}                           # Need batch of empty dicts
 268         for segment in letters:
 269             segdicts[segment] = {}
 270         for word, entry in self.words.items():  # Split into segment dicts
 271             initchar = word[0].upper()
 272             segdicts[initchar][word] = entry
 273
 274         # save
 275         for initchar in letters:
 276             db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None}
 277             pickle_str = marshal.dumps(db)
 278             filename = self.indexdb + initchar
 279             pickle_fh = open(filename, 'wb')
 280             pickle_fh.write(zlib.compress(pickle_str))
 281             os.chmod(filename, 0664)
 282
 283     def purge_entry(self, identifier):
 284         ''' Remove a file from file index and word index
 285         '''
 286         if not self.files.has_key(identifier):
 287             return
 288
 289         file_index = self.files[identifier][0]
 290         del self.files[identifier]
 291         del self.fileids[file_index]
 292
 293         # The much harder part, cleanup the word index
 294         for key, occurs in self.words.items():
 295             if occurs.has_key(file_index):
 296                 del occurs[file_index]
 297
 298     def index_loaded(self):
 299         return (hasattr(self,'fileids') and hasattr(self,'files') and
 300             hasattr(self,'words'))
 301
 302 #
 303 #$Log: not supported by cvs2svn $
 304 #Revision 1.4  2002/07/09 03:02:52  richard
 305 #More indexer work:
 306 #- all String properties may now be indexed too. Currently there's a bit of
 307 #  "issue" specific code in the actual searching which needs to be
 308 #  addressed. In a nutshell:
 309 #  + pass 'indexme="yes"' as a String() property initialisation arg, eg:
 310 #        file = FileClass(db, "file", name=String(), type=String(),
 311 #            comment=String(indexme="yes"))
 312 #  + the comment will then be indexed and be searchable, with the results
 313 #    related back to the issue that the file is linked to
 314 #- as a result of this work, the FileClass has a default MIME type that may
 315 #  be overridden in a subclass, or by the use of a "type" property as is
 316 #  done in the default templates.
 317 #- the regeneration of the indexes (if necessary) is done once the schema is
 318 #  set up in the dbinit.
 319 #
 320 #Revision 1.3  2002/07/08 06:58:15  richard
 321 #cleaned up the indexer code:
 322 # - it splits more words out (much simpler, faster splitter)
 323 # - removed code we'll never use (roundup.roundup_indexer has the full
 324 #   implementation, and replaces roundup.indexer)
 325 # - only index text/plain and rfc822/message (ideas for other text formats to
 326 #   index are welcome)
 327 # - added simple unit test for indexer. Needs more tests for regression.
 328 #
 329 #Revision 1.2  2002/05/25 07:16:24  rochecompaan
 330 #Merged search_indexing-branch with HEAD
 331 #
 332 #Revision 1.1.2.3  2002/05/02 11:52:12  rochecompaan
 333 #Fixed small bug that prevented indexes from being generated.
 334 #
 335 #Revision 1.1.2.2  2002/04/19 19:54:42  rochecompaan
 336 #cgi_client.py
 337 #    removed search link for the time being
 338 #    moved rendering of matches to htmltemplate
 339 #hyperdb.py
 340 #    filtering of nodes on full text search incorporated in filter method
 341 #roundupdb.py
 342 #    added parameter to call of filter method
 343 #roundup_indexer.py
 344 #    added search method to RoundupIndexer class
 345 #
 346 #Revision 1.1.2.1  2002/04/03 11:55:57  rochecompaan
 347 # . Added feature #526730 - search for messages capability
 348 #