From: ber Date: Fri, 11 Sep 2009 15:55:11 +0000 (+0000) Subject: Indexers behaviour made more consistent regarding length of indexed words X-Git-Url: https://git.tokkee.org/?a=commitdiff_plain;h=9df3d85d16ed35b7b8a45ad035fa944557cc429a;p=roundup.git Indexers behaviour made more consistent regarding length of indexed words and stopwords (thanks Thomas Arendsen Hein, Bernhard Reiter)(issue 2550584) git-svn-id: http://svn.roundup-tracker.org/svnroot/roundup/roundup/trunk@4356 57a73879-2fb5-44c3-a270-3262357dd7e2 --- diff --git a/CHANGES.txt b/CHANGES.txt index 46d1cf0..c7c47c5 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -4,7 +4,8 @@ are given with the most recent entry first. 2009-xx-xx 1.4.X Fixes: - +- Indexers behaviour made more consistent regarding length of indexed words + and stopwords (thanks Thomas Arendsen Hein, Bernhard Reiter)(issue 2550584) - fixed typos in the installation instructions (thanks Thomas Arendsen Hein) (issue 2550573) diff --git a/roundup/backends/indexer_common.py b/roundup/backends/indexer_common.py index 5b5dd60..0f159aa 100644 --- a/roundup/backends/indexer_common.py +++ b/roundup/backends/indexer_common.py @@ -22,6 +22,10 @@ class Indexer: self.stopwords = set(STOPWORDS) for word in db.config[('main', 'indexer_stopwords')]: self.stopwords.add(word) + # Do not index anything longer than 25 characters since that'll be + # gibberish (encoded text or somesuch) or shorter than 2 characters + self.minlength = 2 + self.maxlength = 25 def is_stopword(self, word): return word in self.stopwords diff --git a/roundup/backends/indexer_dbm.py b/roundup/backends/indexer_dbm.py index 8483637..5b166b5 100644 --- a/roundup/backends/indexer_dbm.py +++ b/roundup/backends/indexer_dbm.py @@ -135,14 +135,12 @@ class Indexer(IndexerBase): # case insensitive text = str(text).upper() - # Split the raw text, losing anything longer than 25 characters - # since that'll be gibberish (encoded text or somesuch) or shorter - # than 3 characters since those short words appear all over the - # place - return re.findall(r'\b\w{2,25}\b', text) - - # we override this to ignore not 2 < word < 25 and also to fix a bug - - # the (fail) case. + # Split the raw text + return re.findall(r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength), + text) + + # we override this to ignore too short and too long words + # and also to fix a bug - the (fail) case. def find(self, wordlist): '''Locate files that match ALL the words in wordlist ''' @@ -152,10 +150,12 @@ class Indexer(IndexerBase): entries = {} hits = None for word in wordlist: - if not 2 < len(word) < 25: + if not self.minlength <= len(word) <= self.maxlength: # word outside the bounds of what we index - ignore continue word = word.upper() + if self.is_stopword(word): + continue entry = self.words.get(word) # For each word, get index entries[word] = entry # of matching files if not entry: # Nothing for this one word (fail) diff --git a/roundup/backends/indexer_rdbms.py b/roundup/backends/indexer_rdbms.py index 3e4a7de..83d91ae 100644 --- a/roundup/backends/indexer_rdbms.py +++ b/roundup/backends/indexer_rdbms.py @@ -66,11 +66,11 @@ class Indexer(IndexerBase): # ok, find all the unique words in the text text = unicode(text, "utf-8", "replace").upper() wordlist = [w.encode("utf-8") - for w in re.findall(r'(?u)\b\w{2,25}\b', text)] + for w in re.findall(r'(?u)\b\w{%d,%d}\b' + % (self.minlength, self.maxlength), text)] words = set() for word in wordlist: if self.is_stopword(word): continue - if len(word) > 25: continue words.add(word) # for each word, add an entry in the db @@ -86,7 +86,9 @@ class Indexer(IndexerBase): if not wordlist: return [] - l = [word.upper() for word in wordlist if 26 > len(word) > 2] + l = [word.upper() for word in wordlist + if self.minlength <= len(word) <= self.maxlength] + l = [word for word in l if not self.is_stopword(word)] if not l: return [] diff --git a/roundup/backends/indexer_xapian.py b/roundup/backends/indexer_xapian.py index ee38fd3..38a7f2e 100644 --- a/roundup/backends/indexer_xapian.py +++ b/roundup/backends/indexer_xapian.py @@ -88,7 +88,9 @@ class Indexer(IndexerBase): doc.set_data(identifier) doc.add_posting(identifier, 0) - for match in re.finditer(r'\b\w{2,25}\b', text.upper()): + for match in re.finditer(r'\b\w{%d,%d}\b' + % (self.minlength, self.maxlength), + text.upper()): word = match.group(0) if self.is_stopword(word): continue @@ -112,8 +114,10 @@ class Indexer(IndexerBase): enquire = xapian.Enquire(database) stemmer = xapian.Stem("english") terms = [] - for term in [word.upper() for word in wordlist if 26 > len(word) > 2]: - terms.append(stemmer(term.upper())) + for term in [word.upper() for word in wordlist + if self.minlength <= len(word) <= self.maxlength]: + if not self.is_stopword(term): + terms.append(stemmer(term)) query = xapian.Query(xapian.Query.OP_AND, terms) enquire.set_query(query)