roundup/indexer.py

   1 #
   2 # This module is derived from the module described at:
   3 #   http://gnosis.cx/publish/programming/charming_python_15.txt
   4 #
   5 # Author: David Mertz (mertz@gnosis.cx)
   6 # Thanks to: Pat Knight (p.knight@ktgroup.co.uk)
   7 #            Gregory Popovitch (greg@gpy.com)
   8 #
   9 # The original module was released under this license, and remains under
  10 # it:
  11 #
  12 #     This file is released to the public domain.  I (dqm) would
  13 #     appreciate it if you choose to keep derived works under terms
  14 #     that promote freedom, but obviously am giving up any rights
  15 #     to compel such.
  16 #
  17 #$Id: indexer.py,v 1.3 2002-07-08 06:58:15 richard Exp $
  18 '''
  19 This module provides an indexer class, Indexer, that stores text
  20 indices in a roundup instance.  This class makes searching the content of
  21 messages and text files possible.
  22 '''
  23 import os, shutil, re, mimetypes, marshal, zlib, errno
  24
  25 class Indexer:
  26     ''' Indexes messages and files.
  27
  28         This implements a new splitter based on re.findall '\w+' and the
  29         add_othertext method.
  30     '''
  31     def __init__(self, db_path):
  32         indexdb_path = os.path.join(db_path, 'indexes')
  33
  34         # see if we need to reindex because of a change in code
  35         if (os.path.exists(indexdb_path) and
  36                 not os.path.exists(os.path.join(indexdb_path, 'version'))):
  37             shutil.rmtree(indexdb_path)
  38
  39         # see if the index exists
  40         index_exists = 0
  41         if not os.path.exists(indexdb_path):
  42             os.makedirs(indexdb_path)
  43             os.chmod(indexdb_path, 0775)
  44             open(os.path.join(indexdb_path, 'version'), 'w').write('1\n')
  45         else:
  46             index_exists = 1
  47
  48         # save off the path to the indexdb
  49         self.indexdb = os.path.join(indexdb_path, 'index.db')
  50         self.reindex = 0
  51         self.casesensitive = 0
  52         self.quiet = 9
  53
  54         if not index_exists:
  55             # index everything
  56             files_path = os.path.join(db_path, 'files')
  57             self.add_files(dir=files_path)
  58             self.save_index()
  59
  60     # override add_files so it's a little smarter about file types
  61     def add_files(self, dir):
  62         if not hasattr(self, 'files'):
  63             self.load_index()
  64         os.path.walk(dir, self.walk_add_file, None)
  65         # Rebuild the fileid index
  66         self.fileids = {}
  67         for fname in self.files.keys():
  68             fileid = self.files[fname][0]
  69             self.fileids[fileid] = fname
  70
  71     # override add_file so it can be a little smarter about determining the
  72     # file type
  73     def walk_add_file(self, arg, dname, names, ftype=None):
  74         for name in names:
  75             name = os.path.join(dname, name)
  76             if os.path.isfile(name):
  77                 self.add_file(name)
  78             elif os.path.isdir(name):
  79                 os.path.walk(name, self.walk_add_file, None)
  80     def add_file(self, fname, ftype=None):
  81         ''' Index the contents of a regular file
  82         '''
  83         if not hasattr(self, 'files'):
  84             self.load_index()
  85         # Is file eligible for (re)indexing?
  86         if self.files.has_key(fname):
  87             if self.reindex:
  88                 # Reindexing enabled, cleanup dicts
  89                 self.purge_entry(fname, self.files, self.words)
  90             else:
  91                 # DO NOT reindex this file
  92                 if self.quiet < 5:
  93                     print "Skipping", fname
  94                 return 0
  95
  96         # guess the file type
  97         if ftype is None:
  98             ftype = mimetypes.guess_type(fname)
  99
 100         # read in the file
 101         text = open(fname).read()
 102         if self.quiet < 5: print "Indexing", fname
 103         words = self.splitter(text, ftype)
 104
 105         # Find new file index, and assign it to filename
 106         # (_TOP uses trick of negative to avoid conflict with file index)
 107         self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
 108         file_index =  abs(self.files['_TOP'][0])
 109         self.files[fname] = (file_index, len(words))
 110
 111         filedict = {}
 112         for word in words:
 113             if filedict.has_key(word):
 114                 filedict[word] = filedict[word]+1
 115             else:
 116                 filedict[word] = 1
 117
 118         for word in filedict.keys():
 119             if self.words.has_key(word):
 120                 entry = self.words[word]
 121             else:
 122                 entry = {}
 123             entry[file_index] = filedict[word]
 124             self.words[word] = entry
 125
 126     # NOTE: this method signature deviates from the one specified in
 127     # indexer - I'm not entirely sure where it was expected to the text
 128     # from otherwise...
 129     def add_othertext(self, identifier, text):
 130         ''' Add some text associated with the identifier
 131         '''
 132         # Is file eligible for (re)indexing?
 133         if self.files.has_key(identifier):
 134             # Reindexing enabled, cleanup dicts
 135             if self.reindex:
 136                 self.purge_entry(identifier, self.files, self.words)
 137             else:
 138                 # DO NOT reindex this file
 139                 if self.quiet < 5:
 140                     print "Not reindexing", identifier
 141                 return 0
 142
 143         # split into words
 144         words = self.splitter(text, 'text/plain')
 145
 146         # Find new file index, and assign it to identifier
 147         # (_TOP uses trick of negative to avoid conflict with file index)
 148         self.files['_TOP'] = (self.files['_TOP'][0]-1, None)
 149         file_index = abs(self.files['_TOP'][0])
 150         self.files[identifier] = (file_index, len(words))
 151         self.fileids[file_index] = identifier
 152
 153         # find the unique words
 154         filedict = {}
 155         for word in words:
 156             if filedict.has_key(word):
 157                 filedict[word] = filedict[word]+1
 158             else:
 159                 filedict[word] = 1
 160
 161         # now add to the totals
 162         for word in filedict.keys():
 163             # each word has a dict of {identifier: count}
 164             if self.words.has_key(word):
 165                 entry = self.words[word]
 166             else:
 167                 # new word
 168                 entry = {}
 169                 self.words[word] = entry
 170
 171             # make a reference to the file for this word
 172             entry[file_index] = filedict[word]
 173
 174     def splitter(self, text, ftype):
 175         ''' Split the contents of a text string into a list of 'words'
 176         '''
 177         if ftype in ('text/plain', 'message/rfc822'):
 178             words = self.text_splitter(text, self.casesensitive)
 179         else:
 180             return []
 181         return words
 182
 183     def text_splitter(self, text, casesensitive=0):
 184         """Split text/plain string into a list of words
 185         """
 186         # Let's adjust case if not case-sensitive
 187         if not casesensitive:
 188             text = text.upper()
 189
 190         # Split the raw text, losing anything longer than 25 characters
 191         # since that'll be gibberish (encoded text or somesuch) or shorter
 192         # than 3 characters since those short words appear all over the
 193         # place
 194         return re.findall(r'\b\w{2,25}\b', text)
 195
 196     def search(self, search_terms, klass):
 197         ''' display search results
 198         '''
 199         hits = self.find(search_terms)
 200         links = []
 201         nodeids = {}
 202         designator_propname = {'msg': 'messages', 'file': 'files'}
 203         if hits:
 204             hitcount = len(hits)
 205             # build a dictionary of nodes and their associated messages
 206             # and files
 207             for hit in hits.keys():
 208                 filename = hits[hit].split('/')[-1]
 209                 for designator, propname in designator_propname.items():
 210                     if not filename.startswith(designator):
 211                         continue
 212                     nodeid = filename[len(designator):]
 213                     result = apply(klass.find, (), {propname:nodeid})
 214                     if not result:
 215                         continue
 216
 217                     id = str(result[0])
 218                     if not nodeids.has_key(id):
 219                         nodeids[id] = {}
 220
 221                     node_dict = nodeids[id]
 222                     if not node_dict.has_key(propname):
 223                         node_dict[propname] = [nodeid]
 224                     elif node_dict.has_key(propname):
 225                         node_dict[propname].append(nodeid)
 226
 227         return nodeids
 228
 229     # we override this to ignore not 2 < word < 25 and also to fix a bug -
 230     # the (fail) case.
 231     def find(self, wordlist):
 232         ''' Locate files that match ALL the words in wordlist
 233         '''
 234         if not hasattr(self, 'words'):
 235             self.load_index()
 236         self.load_index(wordlist=wordlist)
 237         entries = {}
 238         hits = None
 239         for word in wordlist:
 240             if not 2 < len(word) < 25:
 241                 # word outside the bounds of what we index - ignore
 242                 continue
 243             if not self.casesensitive:
 244                 word = word.upper()
 245             entry = self.words.get(word)    # For each word, get index
 246             entries[word] = entry           #   of matching files
 247             if not entry:                   # Nothing for this one word (fail)
 248                 return {}
 249             if hits is None:
 250                 hits = {}
 251                 for k in entry.keys():
 252                     hits[k] = self.fileids[k]
 253             else:
 254                 # Eliminate hits for every non-match
 255                 for fileid in hits.keys():
 256                     if not entry.has_key(fileid):
 257                         del hits[fileid]
 258         if hits is None:
 259             return {}
 260         return hits
 261
 262     segments = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#-!"
 263     def load_index(self, reload=0, wordlist=None):
 264         # Unless reload is indicated, do not load twice
 265         if self.index_loaded() and not reload:
 266             return 0
 267
 268         # Ok, now let's actually load it
 269         db = {'WORDS': {}, 'FILES': {'_TOP':(0,None)}, 'FILEIDS': {}}
 270
 271         # Identify the relevant word-dictionary segments
 272         if not wordlist:
 273             segments = self.segments
 274         else:
 275             segments = ['-','#']
 276             for word in wordlist:
 277                 segments.append(word[0].upper())
 278
 279         # Load the segments
 280         for segment in segments:
 281             try:
 282                 f = open(self.indexdb + segment, 'rb')
 283             except IOError, error:
 284                 if error.errno != errno.ENOENT:
 285                     raise
 286             else:
 287                 pickle_str = zlib.decompress(f.read())
 288                 f.close()
 289                 dbslice = marshal.loads(pickle_str)
 290                 if dbslice.get('WORDS'):
 291                     # if it has some words, add them
 292                     for word, entry in dbslice['WORDS'].items():
 293                         db['WORDS'][word] = entry
 294                 if dbslice.get('FILES'):
 295                     # if it has some files, add them
 296                     db['FILES'] = dbslice['FILES']
 297                 if dbslice.get('FILEIDS'):
 298                     # if it has fileids, add them
 299                     db['FILEIDS'] = dbslice['FILEIDS']
 300
 301         self.words = db['WORDS']
 302         self.files = db['FILES']
 303         self.fileids = db['FILEIDS']
 304
 305     def save_index(self):
 306         # brutal space saver... delete all the small segments
 307         for segment in self.segments:
 308             try:
 309                 os.remove(self.indexdb + segment)
 310             except OSError:
 311                 # probably just nonexistent segment index file
 312                 # TODO: make sure it's an EEXIST
 313                 pass
 314
 315         # First write the much simpler filename/fileid dictionaries
 316         dbfil = {'WORDS':None, 'FILES':self.files, 'FILEIDS':self.fileids}
 317         open(self.indexdb+'-','wb').write(zlib.compress(marshal.dumps(dbfil)))
 318
 319         # The hard part is splitting the word dictionary up, of course
 320         letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ#"
 321         segdicts = {}                           # Need batch of empty dicts
 322         for segment in letters:
 323             segdicts[segment] = {}
 324         for word, entry in self.words.items():  # Split into segment dicts
 325             initchar = word[0].upper()
 326             segdicts[initchar][word] = entry
 327
 328         # save
 329         for initchar in letters:
 330             db = {'WORDS':segdicts[initchar], 'FILES':None, 'FILEIDS':None}
 331             pickle_str = marshal.dumps(db)
 332             filename = self.indexdb + initchar
 333             pickle_fh = open(filename, 'wb')
 334             pickle_fh.write(zlib.compress(pickle_str))
 335             os.chmod(filename, 0664)
 336
 337     def purge_entry(self, fname, file_dct, word_dct):
 338         ''' Remove a file from file index and word index
 339         '''
 340         try:        # The easy part, cleanup the file index
 341             file_index = file_dct[fname]
 342             del file_dct[fname]
 343         except KeyError:
 344             pass    # We'll assume we only encounter KeyError's
 345         # The much harder part, cleanup the word index
 346         for word, occurs in word_dct.items():
 347             if occurs.has_key(file_index):
 348                 del occurs[file_index]
 349                 word_dct[word] = occurs
 350
 351     def index_loaded(self):
 352         return (hasattr(self,'fileids') and hasattr(self,'files') and
 353             hasattr(self,'words'))
 354
 355 #
 356 #$Log: not supported by cvs2svn $
 357 #Revision 1.2  2002/05/25 07:16:24  rochecompaan
 358 #Merged search_indexing-branch with HEAD
 359 #
 360 #Revision 1.1.2.3  2002/05/02 11:52:12  rochecompaan
 361 #Fixed small bug that prevented indexes from being generated.
 362 #
 363 #Revision 1.1.2.2  2002/04/19 19:54:42  rochecompaan
 364 #cgi_client.py
 365 #    removed search link for the time being
 366 #    moved rendering of matches to htmltemplate
 367 #hyperdb.py
 368 #    filtering of nodes on full text search incorporated in filter method
 369 #roundupdb.py
 370 #    added paramater to call of filter method
 371 #roundup_indexer.py
 372 #    added search method to RoundupIndexer class
 373 #
 374 #Revision 1.1.2.1  2002/04/03 11:55:57  rochecompaan
 375 # . Added feature #526730 - search for messages capability
 376 #