diff options
| author | Adrian Holovaty <adrian@holovaty.com> | 2005-07-13 01:25:57 +0000 |
|---|---|---|
| committer | Adrian Holovaty <adrian@holovaty.com> | 2005-07-13 01:25:57 +0000 |
| commit | ed114e15106192b22ebb78ef5bf5bce72b419d13 (patch) | |
| tree | f7c27f035cca8d50bd69e2ecbd7497fccec4a35a /django/utils/stopwords.py | |
| parent | 07ffc7d605cc96557db28a9e35da69bc0719611b (diff) | |
Imported Django from private SVN repository (created from r. 8825)
git-svn-id: http://code.djangoproject.com/svn/django/trunk@3 bcc190cf-cafb-0310-a4f2-bffc1f526a37
Diffstat (limited to 'django/utils/stopwords.py')
| -rw-r--r-- | django/utils/stopwords.py | 42 |
1 files changed, 42 insertions, 0 deletions
# Performance note: the original author benchmarked this code with a set
# in place of the list below and found that the list performed better,
# possibly because the collection is so small. The stopwords are therefore
# deliberately kept in a plain list.

# The literal is split on whitespace, producing one list entry per word.
stopwords = '''
i
a
an
are
as
at
be
by
for
from
how
in
is
it
of
on
or
that
the
this
to
was
what
when
where
'''.split()


def strip_stopwords(sentence):
    """Remove stopwords from ``sentence`` and normalize its whitespace.

    The sentence is split on runs of whitespace, every word whose
    lowercased form appears in ``stopwords`` is dropped, and the words
    that remain are joined back together with single spaces. Kept words
    retain their original case; punctuation attached to a word is part
    of that word for the purposes of the comparison.
    """
    kept = [
        token
        for token in sentence.split()
        if token.lower() not in stopwords
    ]
    return ' '.join(kept)
