From 9df3d85d16ed35b7b8a45ad035fa944557cc429a Mon Sep 17 00:00:00 2001
From: ber <ber@57a73879-2fb5-44c3-a270-3262357dd7e2>
Date: Fri, 11 Sep 2009 15:55:11 +0000
Subject: [PATCH] Indexers behaviour made more consistent regarding length of
 indexed words and stopwords (thanks Thomas Arendsen Hein, Bernhard
 Reiter)(issue 2550584)

git-svn-id: http://svn.roundup-tracker.org/svnroot/roundup/roundup/trunk@4356 57a73879-2fb5-44c3-a270-3262357dd7e2
---
 CHANGES.txt                        |  3 ++-
 roundup/backends/indexer_common.py |  4 ++++
 roundup/backends/indexer_dbm.py    | 18 +++++++++---------
 roundup/backends/indexer_rdbms.py  |  8 +++++---
 roundup/backends/indexer_xapian.py | 10 +++++++---
 5 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 46d1cf0..c7c47c5 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -4,7 +4,8 @@ are given with the most recent entry first.
 2009-xx-xx 1.4.X
 
 Fixes:
-
+- Indexer behaviour made more consistent regarding length of indexed words
+  and stopwords (thanks Thomas Arendsen Hein, Bernhard Reiter) (issue 2550584)
 - fixed typos in the installation instructions (thanks Thomas Arendsen Hein)
   (issue 2550573) 
 
diff --git a/roundup/backends/indexer_common.py b/roundup/backends/indexer_common.py
index 5b5dd60..0f159aa 100644
--- a/roundup/backends/indexer_common.py
+++ b/roundup/backends/indexer_common.py
@@ -22,6 +22,10 @@ class Indexer:
         self.stopwords = set(STOPWORDS)
         for word in db.config[('main', 'indexer_stopwords')]:
             self.stopwords.add(word)
+        # Do not index words shorter than 2 characters, or longer than 25
+        # characters since those will be gibberish (encoded text or somesuch)
+        self.minlength = 2
+        self.maxlength = 25
 
     def is_stopword(self, word):
         return word in self.stopwords
diff --git a/roundup/backends/indexer_dbm.py b/roundup/backends/indexer_dbm.py
index 8483637..5b166b5 100644
--- a/roundup/backends/indexer_dbm.py
+++ b/roundup/backends/indexer_dbm.py
@@ -135,14 +135,12 @@ class Indexer(IndexerBase):
         # case insensitive
         text = str(text).upper()
 
-        # Split the raw text, losing anything longer than 25 characters
-        # since that'll be gibberish (encoded text or somesuch) or shorter
-        # than 3 characters since those short words appear all over the
-        # place
-        return re.findall(r'\b\w{2,25}\b', text)
-
-    # we override this to ignore not 2 < word < 25 and also to fix a bug -
-    # the (fail) case.
+        # Split the raw text
+        return re.findall(r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength),
+                          text)
+
+    # we override this to ignore too short and too long words
+    # and also to fix a bug - the (fail) case.
     def find(self, wordlist):
         '''Locate files that match ALL the words in wordlist
         '''
@@ -152,10 +150,12 @@ class Indexer(IndexerBase):
         entries = {}
         hits = None
         for word in wordlist:
-            if not 2 < len(word) < 25:
+            if not self.minlength <= len(word) <= self.maxlength:
                 # word outside the bounds of what we index - ignore
                 continue
             word = word.upper()
+            if self.is_stopword(word):
+                continue
             entry = self.words.get(word)    # For each word, get index
             entries[word] = entry           #   of matching files
             if not entry:                   # Nothing for this one word (fail)
diff --git a/roundup/backends/indexer_rdbms.py b/roundup/backends/indexer_rdbms.py
index 3e4a7de..83d91ae 100644
--- a/roundup/backends/indexer_rdbms.py
+++ b/roundup/backends/indexer_rdbms.py
@@ -66,11 +66,11 @@ class Indexer(IndexerBase):
         # ok, find all the unique words in the text
         text = unicode(text, "utf-8", "replace").upper()
         wordlist = [w.encode("utf-8")
-            for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
+            for w in re.findall(r'(?u)\b\w{%d,%d}\b'
+                                % (self.minlength, self.maxlength), text)]
         words = set()
         for word in wordlist:
             if self.is_stopword(word): continue
-            if len(word) > 25: continue
             words.add(word)
 
         # for each word, add an entry in the db
@@ -86,7 +86,9 @@ class Indexer(IndexerBase):
         if not wordlist:
             return []
 
-        l = [word.upper() for word in wordlist if 26 > len(word) > 2]
+        l = [word.upper() for word in wordlist
+             if self.minlength <= len(word) <= self.maxlength]
+        l = [word for word in l if not self.is_stopword(word)]
 
         if not l:
             return []
diff --git a/roundup/backends/indexer_xapian.py b/roundup/backends/indexer_xapian.py
index ee38fd3..38a7f2e 100644
--- a/roundup/backends/indexer_xapian.py
+++ b/roundup/backends/indexer_xapian.py
@@ -88,7 +88,9 @@ class Indexer(IndexerBase):
         doc.set_data(identifier)
         doc.add_posting(identifier, 0)
 
-        for match in re.finditer(r'\b\w{2,25}\b', text.upper()):
+        for match in re.finditer(r'\b\w{%d,%d}\b'
+                                 % (self.minlength, self.maxlength),
+                                 text.upper()):
             word = match.group(0)
             if self.is_stopword(word):
                 continue
@@ -112,8 +114,10 @@ class Indexer(IndexerBase):
         enquire = xapian.Enquire(database)
         stemmer = xapian.Stem("english")
         terms = []
-        for term in [word.upper() for word in wordlist if 26 > len(word) > 2]:
-            terms.append(stemmer(term.upper()))
+        for term in [word.upper() for word in wordlist
+                          if self.minlength <= len(word) <= self.maxlength]:
+            if not self.is_stopword(term):
+                terms.append(stemmer(term))
         query = xapian.Query(xapian.Query.OP_AND, terms)
 
         enquire.set_query(query)
-- 
2.39.5