summaryrefslogtreecommitdiff
path: root/django/utils/stopwords.py
diff options
context:
space:
mode:
authorAdrian Holovaty <adrian@holovaty.com>2005-07-13 01:25:57 +0000
committerAdrian Holovaty <adrian@holovaty.com>2005-07-13 01:25:57 +0000
commited114e15106192b22ebb78ef5bf5bce72b419d13 (patch)
treef7c27f035cca8d50bd69e2ecbd7497fccec4a35a /django/utils/stopwords.py
parent07ffc7d605cc96557db28a9e35da69bc0719611b (diff)
Imported Django from private SVN repository (created from r. 8825)
git-svn-id: http://code.djangoproject.com/svn/django/trunk@3 bcc190cf-cafb-0310-a4f2-bffc1f526a37
Diffstat (limited to 'django/utils/stopwords.py')
-rw-r--r--django/utils/stopwords.py42
1 files changed, 42 insertions, 0 deletions
diff --git a/django/utils/stopwords.py b/django/utils/stopwords.py
new file mode 100644
index 0000000000..dea5660413
--- /dev/null
+++ b/django/utils/stopwords.py
@@ -0,0 +1,42 @@
+# Performance note: I benchmarked this code using a set instead of
+# a list for the stopwords and was surprised to find that the list
+# performed /better/ than the set - maybe because it's only a small
+# list.
+
+# Common English words that strip_stopwords() removes from a sentence.
+# All entries are lower case; this is intentionally a plain list (see the
+# performance note above).
+stopwords = [
+    'i', 'a', 'an', 'are', 'as',
+    'at', 'be', 'by', 'for', 'from',
+    'how', 'in', 'is', 'it', 'of',
+    'on', 'or', 'that', 'the', 'this',
+    'to', 'was', 'what', 'when', 'where',
+]
+
+def strip_stopwords(sentence):
+    """
+    Return ``sentence`` with every stopword removed.
+
+    The sentence is split on runs of whitespace and the remaining words are
+    re-joined with single spaces, so whitespace is normalized as well.
+    Words are compared against ``stopwords`` in lower case; words that are
+    kept retain their original case.
+    """
+    kept = [word for word in sentence.split() if word.lower() not in stopwords]
+    return ' '.join(kept)
+