diff options
Diffstat (limited to 'django/utils/html.py')
| -rw-r--r-- | django/utils/html.py | 35 |
1 file changed, 33 insertions, 2 deletions
diff --git a/django/utils/html.py b/django/utils/html.py
index 8b28d97d13..0d28c77a61 100644
--- a/django/utils/html.py
+++ b/django/utils/html.py
@@ -16,6 +16,9 @@ from django.utils.functional import allow_lazy
 from django.utils import six
 from django.utils.text import normalize_newlines
 
+from .html_parser import HTMLParser, HTMLParseError
+
+
 # Configuration for urlize() function.
 TRAILING_PUNCTUATION = ['.', ',', ':', ';', '.)']
 WRAPPING_PUNCTUATION = [('(', ')'), ('<', '>'), ('[', ']'), ('<', '>')]
@@ -33,7 +36,6 @@ link_target_attribute_re = re.compile(r'(<a [^>]*?)target=[^\s>]+')
 html_gunk_re = re.compile(r'(?:<br clear="all">|<i><\/i>|<b><\/b>|<em><\/em>|<strong><\/strong>|<\/?smallcaps>|<\/?uppercase>)', re.IGNORECASE)
 hard_coded_bullets_re = re.compile(r'((?:<p>(?:%s).*?[a-zA-Z].*?</p>\s*)+)' % '|'.join([re.escape(x) for x in DOTS]), re.DOTALL)
 trailing_empty_content_re = re.compile(r'(?:<p>(?: |\s|<br \/>)*?</p>\s*)+\Z')
-strip_tags_re = re.compile(r'</?\S([^=>]*=(\s*"[^"]*"|\s*\'[^\']*\'|\S*)|[^>])*?>', re.IGNORECASE)
 
 
 def escape(text):
@@ -116,9 +118,31 @@ def linebreaks(value, autoescape=False):
     return '\n\n'.join(paras)
 linebreaks = allow_lazy(linebreaks, six.text_type)
 
+
+class MLStripper(HTMLParser):
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.reset()
+        self.fed = []
+    def handle_data(self, d):
+        self.fed.append(d)
+    def handle_entityref(self, name):
+        self.fed.append('&%s;' % name)
+    def handle_charref(self, name):
+        self.fed.append('&#%s;' % name)
+    def get_data(self):
+        return ''.join(self.fed)
+
 def strip_tags(value):
     """Returns the given HTML with all tags stripped."""
-    return strip_tags_re.sub('', force_text(value))
+    s = MLStripper()
+    try:
+        s.feed(value)
+        s.close()
+    except HTMLParseError:
+        return value
+    else:
+        return s.get_data()
 strip_tags = allow_lazy(strip_tags)
 
 def remove_tags(html, tags):
@@ -281,3 +305,10 @@ def clean_html(text):
     text = trailing_empty_content_re.sub('', text)
     return text
 clean_html = allow_lazy(clean_html, six.text_type)
+
+def avoid_wrapping(value):
+    """
+    Avoid text wrapping in the middle of a phrase by adding non-breaking
+    spaces where there previously were normal spaces.
+    """
+    return value.replace(" ", "\xa0")
|
