diff options
Diffstat (limited to 'django/utils/html.py')
| -rw-r--r-- | django/utils/html.py | 80 |
1 files changed, 45 insertions, 35 deletions
diff --git a/django/utils/html.py b/django/utils/html.py index e1860627ce..ebd04d1b3c 100644 --- a/django/utils/html.py +++ b/django/utils/html.py @@ -1,7 +1,10 @@ "HTML utilities suitable for global use." -import re, string -from django.utils.encoding import smart_unicode +import re +import string + +from django.utils.encoding import force_unicode +from django.utils.functional import allow_lazy # Configuration for urlize() function LEADING_PUNCTUATION = ['(', '<', '&lt;'] @@ -23,40 +26,45 @@ trailing_empty_content_re = re.compile(r'(?:<p>(?:&nbsp;|\s|<br \/>)*?</p>\s*)+\ del x # Temporary variable def escape(html): - "Returns the given HTML with ampersands, quotes and carets encoded" - if not isinstance(html, basestring): - html = str(html) - return html.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;') + "Return the given HTML with ampersands, quotes and carets encoded." + return force_unicode(html).replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;') +escape = allow_lazy(escape, unicode) def linebreaks(value): - "Converts newlines into <p> and <br />s" - value = re.sub(r'\r\n|\r|\n', '\n', value) # normalize newlines + "Convert newlines into <p> and <br />s." + value = re.sub(r'\r\n|\r|\n', '\n', force_unicode(value)) # normalize newlines paras = re.split('\n{2,}', value) - paras = ['<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras] - return '\n\n'.join(paras) + paras = [u'<p>%s</p>' % p.strip().replace('\n', '<br />') for p in paras] + return u'\n\n'.join(paras) +linebreaks = allow_lazy(linebreaks, unicode) def strip_tags(value): - "Returns the given HTML with all tags stripped" - return re.sub(r'<[^>]*?>', '', value) + "Return the given HTML with all tags stripped."
+ return re.sub(r'<[^>]*?>', '', force_unicode(value)) +strip_tags = allow_lazy(strip_tags) def strip_spaces_between_tags(value): - "Returns the given HTML with spaces between tags removed" - return re.sub(r'>\s+<', '><', value) + "Return the given HTML with spaces between tags removed." + return re.sub(r'>\s+<', '><', force_unicode(value)) +strip_spaces_between_tags = allow_lazy(strip_spaces_between_tags, unicode) def strip_entities(value): - "Returns the given HTML with all entities (&something;) stripped" - return re.sub(r'&(?:\w+|#\d);', '', value) + "Return the given HTML with all entities (&something;) stripped." + return re.sub(r'&(?:\w+|#\d+);', '', force_unicode(value)) +strip_entities = allow_lazy(strip_entities, unicode) def fix_ampersands(value): - "Returns the given HTML with all unencoded ampersands encoded correctly" - return unencoded_ampersands_re.sub('&amp;', value) + "Return the given HTML with all unencoded ampersands encoded correctly." + return unencoded_ampersands_re.sub('&amp;', force_unicode(value)) +fix_ampersands = allow_lazy(fix_ampersands, unicode) def urlize(text, trim_url_limit=None, nofollow=False): """ - Converts any URLs in text into clickable links. Works on http://, https:// - and www. links. Links can have trailing punctuation (periods, commas, - close-parens) and leading punctuation (opening parens) and it'll still do - the right thing. + Convert any URLs in text into clickable links. + + Works on http://, https://, and www. links. Links can have trailing + punctuation (periods, commas, close-parens) and leading punctuation + (opening parens) and it'll still do the right thing. If trim_url_limit is not None, the URLs in link text longer than this limit will truncated to trim_url_limit-3 characters and appended with an elipsis. @@ -65,7 +73,7 @@ def urlize(text, trim_url_limit=None, nofollow=False): attribute. """ trim_url = lambda x, limit=trim_url_limit: limit is not None and (len(x) > limit and ('%s...'
% x[:max(0, limit - 3)])) or x - words = word_split_re.split(text) + words = word_split_re.split(force_unicode(text)) nofollow_attr = nofollow and ' rel="nofollow"' or '' for i, word in enumerate(words): match = punctuation_re.match(word) @@ -82,22 +90,23 @@ def urlize(text, trim_url_limit=None, nofollow=False): middle = '<a href="mailto:%s">%s</a>' % (middle, middle) if lead + middle + trail != word: words[i] = lead + middle + trail - return ''.join(words) + return u''.join(words) +urlize = allow_lazy(urlize, unicode) def clean_html(text): """ - Cleans the given HTML. Specifically, it does the following: - * Converts <b> and <i> to <strong> and <em>. - * Encodes all ampersands correctly. - * Removes all "target" attributes from <a> tags. - * Removes extraneous HTML, such as presentational tags that open and + Clean the given HTML. Specifically, do the following: + * Convert <b> and <i> to <strong> and <em>. + * Encode all ampersands correctly. + * Remove all "target" attributes from <a> tags. + * Remove extraneous HTML, such as presentational tags that open and immediately close and <br clear="all">. - * Converts hard-coded bullets into HTML unordered lists. - * Removes stuff like "<p> </p>", but only if it's at the + * Convert hard-coded bullets into HTML unordered lists. + * Remove stuff like "<p> </p>", but only if it's at the bottom of the text. """ from django.utils.text import normalize_newlines - text = normalize_newlines(text) + text = normalize_newlines(force_unicode(text)) text = re.sub(r'<(/?)\s*b\s*>', '<\\1strong>', text) text = re.sub(r'<(/?)\s*i\s*>', '<\\1em>', text) text = fix_ampersands(text) @@ -110,9 +119,10 @@ def clean_html(text): s = match.group().replace('</p>', '</li>') for d in DOTS: s = s.replace('<p>%s' % d, '<li>') - return '<ul>\n%s\n</ul>' % s + return u'<ul>\n%s\n</ul>' % s text = hard_coded_bullets_re.sub(replace_p_tags, text) - # Remove stuff like "<p> </p>", but only if it's at the bottom of the text. 
+ # Remove stuff like "<p> </p>", but only if it's at the bottom + # of the text. text = trailing_empty_content_re.sub('', text) return text - +clean_html = allow_lazy(clean_html, unicode) |
