index 3e4a7de78987e2af8dab931d17507fd7c5f04bee..83d91ae9f4ddf8c324d4f8589ff7433cd62bf45c 100644 (file)
# ok, find all the unique words in the text
text = unicode(text, "utf-8", "replace").upper()
wordlist = [w.encode("utf-8")
- for w in re.findall(r'(?u)\b\w{2,25}\b', text)]
+ for w in re.findall(r'(?u)\b\w{%d,%d}\b'
+ % (self.minlength, self.maxlength), text)]
words = set()
for word in wordlist:
if self.is_stopword(word): continue
- if len(word) > 25: continue
words.add(word)
# for each word, add an entry in the db
if not wordlist:
return []
- l = [word.upper() for word in wordlist if 26 > len(word) > 2]
+ l = [word.upper() for word in wordlist
+ if self.minlength <= len(word) <= self.maxlength]
+ l = [word for word in l if not self.is_stopword(word)]
if not l:
return []