summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: f4046fb)
author | ber <ber@57a73879-2fb5-44c3-a270-3262357dd7e2> | |
Fri, 11 Sep 2009 15:55:11 +0000 (15:55 +0000) | ||
committer | ber <ber@57a73879-2fb5-44c3-a270-3262357dd7e2> | |
Fri, 11 Sep 2009 15:55:11 +0000 (15:55 +0000) |
Indexers behaviour made more consistent regarding length of indexed words
and stopwords (thanks Thomas Arendsen Hein, Bernhard Reiter)(issue 2550584)
git-svn-id: http://svn.roundup-tracker.org/svnroot/roundup/roundup/trunk@4356 57a73879-2fb5-44c3-a270-3262357dd7e2
diff --git a/CHANGES.txt b/CHANGES.txt
index 46d1cf009bc30955c019f9cb53af0d121e159fbd..c7c47c50faabf00b2f867daf73d6a08a435f9399 100644 (file)
--- a/CHANGES.txt
+++ b/CHANGES.txt
2009-xx-xx 1.4.X
Fixes:
-
+- Indexers behaviour made more consistent regarding length of indexed words
+ and stopwords (thanks Thomas Arendsen Hein, Bernhard Reiter)(issue 2550584)
- fixed typos in the installation instructions (thanks Thomas Arendsen Hein)
(issue 2550573)
index 5b5dd60c69f15fe3b15f0885aadcc2bea99b7d62..0f159aa527861dd4582d1bd086a52953d7151799 100644 (file)
self.stopwords = set(STOPWORDS)
for word in db.config[('main', 'indexer_stopwords')]:
self.stopwords.add(word)
+ # Do not index anything longer than 25 characters since that'll be
+ # gibberish (encoded text or somesuch) or shorter than 2 characters
+ self.minlength = 2
+ self.maxlength = 25
def is_stopword(self, word):
return word in self.stopwords
index 8483637ecea6c11ef3eef676d29647c6e51147a4..5b166b52e88e616e7e61a8e8907b054c85f77a54 100644 (file)
# case insensitive
text = str(text).upper()
- # Split the raw text, losing anything longer than 25 characters
- # since that'll be gibberish (encoded text or somesuch) or shorter
- # than 3 characters since those short words appear all over the
- # place
- return re.findall(r'\b\w{2,25}\b', text)
-
- # we override this to ignore not 2 < word < 25 and also to fix a bug -
- # the (fail) case.
+ # Split the raw text
+ return re.findall(r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength),
+ text)
+
+ # we override this to ignore too short and too long words
+ # and also to fix a bug - the (fail) case.
def find(self, wordlist):
'''Locate files that match ALL the words in wordlist
'''
entries = {}
hits = None
for word in wordlist:
- if not 2 < len(word) < 25:
+ if not self.minlength <= len(word) <= self.maxlength:
# word outside the bounds of what we index - ignore
continue
word = word.upper()
+ if self.is_stopword(word):
+ continue
entry = self.words.get(word) # For each word, get index
entries[word] = entry # of matching files
if not entry: # Nothing for this one word (fail)
index 3e4a7de78987e2af8dab931d17507fd7c5f04bee..83d91ae9f4ddf8c324d4f8589ff7433cd62bf45c 100644 (file)
# ok, find all the unique words in the text
text = unicode(text, "utf-8", "replace").upper()
wordlist = [w.encode("utf-8")
- for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
+ for w in re.findall(r'(?u)\b\w{%d,%d}\b'
+ % (self.minlength, self.maxlength), text)]
words = set()
for word in wordlist:
if self.is_stopword(word): continue
- if len(word) > 25: continue
words.add(word)
# for each word, add an entry in the db
if not wordlist:
return []
- l = [word.upper() for word in wordlist if 26 > len(word) > 2]
+ l = [word.upper() for word in wordlist
+ if self.minlength <= len(word) <= self.maxlength]
+ l = [word for word in l if not self.is_stopword(word)]
if not l:
return []
index ee38fd30d4445adb9013d59ea7eb903a67243782..38a7f2ee9d87cbd6075dcfa52522eb281a0a5442 100644 (file)
doc.set_data(identifier)
doc.add_posting(identifier, 0)
- for match in re.finditer(r'\b\w{2,25}\b', text.upper()):
+ for match in re.finditer(r'\b\w{%d,%d}\b'
+ % (self.minlength, self.maxlength),
+ text.upper()):
word = match.group(0)
if self.is_stopword(word):
continue
enquire = xapian.Enquire(database)
stemmer = xapian.Stem("english")
terms = []
- for term in [word.upper() for word in wordlist if 26 > len(word) > 2]:
- terms.append(stemmer(term.upper()))
+ for term in [word.upper() for word in wordlist
+ if self.minlength <= len(word) <= self.maxlength]:
+ if not self.is_stopword(term):
+ terms.append(stemmer(term))
query = xapian.Query(xapian.Query.OP_AND, terms)
enquire.set_query(query)