Fixed #19237 -- Used HTML parser to strip tags

The regex method used until now for the strip_tags utility is fast, but subject to flaws and security issues. Consensus and good practice lead us to use a slower but safer method.
author: Claude Paroz <claude@2xlibre.net> 2013-05-22 17:29:16 +0200
committer: Claude Paroz <claude@2xlibre.net> 2013-05-22 17:34:02 +0200
commit: dc51ec8bc214cf60ebb99732363624c23df8005f (patch)
tree: 5b870ff55500ee2b3bed9547bafee290e86a29fe /django
parent: 01948e384f5508c126c7216e43db3654bf6330f0 (diff)
1 files changed, 26 insertions, 2 deletions
diff --git a/django/utils/html.py b/django/utils/html.py
index edddc48e62..573235092d 100644
--- a/django/utils/html.py
+++ b/django/utils/html.py
@@ -16,6 +16,9 @@ from django.utils.functional import allow_lazy
 from django.utils import six
 from django.utils.text import normalize_newlines
 
+from .html_parser import HTMLParser
+
+
 # Configuration for urlize() function.
 TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)']
 WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('&lt;', '&gt;')]
@@ -33,7 +36,6 @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
 html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
 hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
 trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\Z')
-strip_tags_re = re.compile(r'</?\S([^=>]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE)
 
 
 def escape(text):
@@ -116,9 +118,31 @@ def linebreaks(value, autoescape=False):
     return '\n\n'.join(paras)
 linebreaks = allow_lazy(linebreaks, six.text_type)
 
+
+class MLStripper(HTMLParser):
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.reset()
+        self.fed = []
+    def handle_data(self, d):
+        self.fed.append(d)
+    def handle_entityref(self, name):
+        self.fed.append('&%s;' % name)
+    def handle_charref(self, name):
+        self.fed.append('&#%s;' % name)
+    def get_data(self):
+        return ''.join(self.fed)
+
 def strip_tags(value):
     """Returns the given HTML with all tags stripped."""
-    return strip_tags_re.sub('', force_text(value))
+    s = MLStripper()
+    s.feed(value)
+    data = s.get_data()
+    try:
+        res = s.close()
+    except Exception as e:
+        data += s.rawdata
+    return data
 strip_tags = allow_lazy(strip_tags)
 
 def remove_tags(html, tags):
author	Claude Paroz <claude@2xlibre.net>	2013-05-22 17:29:16 +0200
committer	Claude Paroz <claude@2xlibre.net>	2013-05-22 17:34:02 +0200
commit	dc51ec8bc214cf60ebb99732363624c23df8005f (patch)
tree	5b870ff55500ee2b3bed9547bafee290e86a29fe /django
parent	01948e384f5508c126c7216e43db3654bf6330f0 (diff)