summary | shortlog | log | commit | commitdiff | tree
raw | patch | inline | side by side (parent: f4046fb)
author | ber <ber@57a73879-2fb5-44c3-a270-3262357dd7e2> | |
Fri, 11 Sep 2009 15:55:11 +0000 (15:55 +0000) | ||
committer | ber <ber@57a73879-2fb5-44c3-a270-3262357dd7e2> | |
Fri, 11 Sep 2009 15:55:11 +0000 (15:55 +0000) |
Indexers behaviour made more consistent regarding length of indexed words
and stopwords (thanks Thomas Arendsen Hein, Bernhard Reiter)(issue 2550584)
git-svn-id: http://svn.roundup-tracker.org/svnroot/roundup/roundup/trunk@4356 57a73879-2fb5-44c3-a270-3262357dd7e2
diff --git a/CHANGES.txt b/CHANGES.txt
index 46d1cf009bc30955c019f9cb53af0d121e159fbd..c7c47c50faabf00b2f867daf73d6a08a435f9399 100644 (file)
--- a/CHANGES.txt
+++ b/CHANGES.txt
2009-xx-xx 1.4.X
Fixes:
-
+- Indexers behaviour made more consistent regarding length of indexed words
+ and stopwords (thanks Thomas Arendsen Hein, Bernhard Reiter)(issue 2550584)
- fixed typos in the installation instructions (thanks Thomas Arendsen Hein)
(issue 2550573)
index 5b5dd60c69f15fe3b15f0885aadcc2bea99b7d62..0f159aa527861dd4582d1bd086a52953d7151799 100644 (file)
self.stopwords = set(STOPWORDS)
for word in db.config[('main', 'indexer_stopwords')]:
self.stopwords.add(word)
+ # Do not index anything longer than 25 characters since that'll be
+ # gibberish (encoded text or somesuch) or shorter than 2 characters
+ self.minlength = 2
+ self.maxlength = 25
def is_stopword(self, word):
return word in self.stopwords
index 8483637ecea6c11ef3eef676d29647c6e51147a4..5b166b52e88e616e7e61a8e8907b054c85f77a54 100644 (file)
# case insensitive
text = str(text).upper()
- # Split the raw text, losing anything longer than 25 characters
- # since that'll be gibberish (encoded text or somesuch) or shorter
- # than 3 characters since those short words appear all over the
- # place
- return re.findall(r'\b\w{2,25}\b', text)
-
- # we override this to ignore not 2 < word < 25 and also to fix a bug -
- # the (fail) case.
+ # Split the raw text
+ return re.findall(r'\b\w{%d,%d}\b' % (self.minlength, self.maxlength),
+ text)
+
+ # we override this to ignore too short and too long words
+ # and also to fix a bug - the (fail) case.
def find(self, wordlist):
'''Locate files that match ALL the words in wordlist
'''
entries = {}
hits = None
for word in wordlist:
- if not 2 < len(word) < 25:
+ if not self.minlength <= len(word) <= self.maxlength:
# word outside the bounds of what we index - ignore
continue
word = word.upper()
+ if self.is_stopword(word):
+ continue
entry = self.words.get(word) # For each word, get index
entries[word] = entry # of matching files
if not entry: # Nothing for this one word (fail)
index 3e4a7de78987e2af8dab931d17507fd7c5f04bee..83d91ae9f4ddf8c324d4f8589ff7433cd62bf45c 100644 (file)
# ok, find all the unique words in the text
text = unicode(text, "utf-8", "replace").upper()
wordlist = [w.encode("utf-8")
- for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
+ for w in re.findall(r'(?u)\b\w{%d,%d}\b'
+ % (self.minlength, self.maxlength), text)]
words = set()
for word in wordlist:
if self.is_stopword(word): continue
- if len(word) > 25: continue
words.add(word)
# for each word, add an entry in the db
if not wordlist:
return []
- l = [word.upper() for word in wordlist if 26 > len(word) > 2]
+ l = [word.upper() for word in wordlist
+ if self.minlength <= len(word) <= self.maxlength]
+ l = [word for word in l if not self.is_stopword(word)]
if not l:
return []
index ee38fd30d4445adb9013d59ea7eb903a67243782..38a7f2ee9d87cbd6075dcfa52522eb281a0a5442 100644 (file)
doc.set_data(identifier)
doc.add_posting(identifier, 0)
- for match in re.finditer(r'\b\w{2,25}\b', text.upper()):
+ for match in re.finditer(r'\b\w{%d,%d}\b'
+ % (self.minlength, self.maxlength),
+ text.upper()):
word = match.group(0)
if self.is_stopword(word):
continue
enquire = xapian.Enquire(database)
stemmer = xapian.Stem("english")
terms = []
- for term in [word.upper() for word in wordlist if 26 > len(word) > 2]:
- terms.append(stemmer(term.upper()))
+ for term in [word.upper() for word in wordlist
+ if self.minlength <= len(word) <= self.maxlength]:
+ if not self.is_stopword(term):
+ terms.append(stemmer(term))
query = xapian.Query(xapian.Query.OP_AND, terms)
enquire.set_query(query)