diff options
| author | David Smith <smithdc@gmail.com> | 2023-01-03 20:48:06 +0000 |
|---|---|---|
| committer | Mariusz Felisiak <felisiak.mariusz@gmail.com> | 2024-02-07 09:46:25 +0100 |
| commit | 6ee37ada3241ed263d8d1c2901b030d964cbd161 (patch) | |
| tree | b37d4c173f5a7621e9304055875eca8d1939a069 /django/utils | |
| parent | 70f39e46f86b946c273340d52109824c776ffb4c (diff) | |
Fixed #30686 -- Used Python HTMLParser in utils.text.Truncator.
Diffstat (limited to 'django/utils')
| -rw-r--r-- | django/utils/text.py | 215 |
1 file changed, 115 insertions, 100 deletions
diff --git a/django/utils/text.py b/django/utils/text.py index 374fd78f92..9560ebc678 100644 --- a/django/utils/text.py +++ b/django/utils/text.py @@ -2,12 +2,20 @@ import gzip import re import secrets import unicodedata +from collections import deque from gzip import GzipFile from gzip import compress as gzip_compress +from html import escape +from html.parser import HTMLParser from io import BytesIO from django.core.exceptions import SuspiciousFileOperation -from django.utils.functional import SimpleLazyObject, keep_lazy_text, lazy +from django.utils.functional import ( + SimpleLazyObject, + cached_property, + keep_lazy_text, + lazy, +) from django.utils.regex_helper import _lazy_re_compile from django.utils.translation import gettext as _ from django.utils.translation import gettext_lazy, pgettext @@ -80,6 +88,101 @@ def add_truncation_text(text, truncate=None): return f"{text}{truncate}" +def calculate_truncate_chars_length(length, replacement): + truncate_len = length + for char in add_truncation_text("", replacement): + if not unicodedata.combining(char): + truncate_len -= 1 + if truncate_len == 0: + break + return truncate_len + + +class TruncateHTMLParser(HTMLParser): + class TruncationCompleted(Exception): + pass + + def __init__(self, *, length, replacement, convert_charrefs=True): + super().__init__(convert_charrefs=convert_charrefs) + self.tags = deque() + self.output = "" + self.remaining = length + self.replacement = replacement + + @cached_property + def void_elements(self): + from django.utils.html import VOID_ELEMENTS + + return VOID_ELEMENTS + + def handle_startendtag(self, tag, attrs): + self.handle_starttag(tag, attrs) + if tag not in self.void_elements: + self.handle_endtag(tag) + + def handle_starttag(self, tag, attrs): + self.output += self.get_starttag_text() + if tag not in self.void_elements: + self.tags.appendleft(tag) + + def handle_endtag(self, tag): + if tag not in self.void_elements: + self.output += f"</{tag}>" + try: + 
self.tags.remove(tag) + except ValueError: + pass + + def handle_data(self, data): + data, output = self.process(data) + data_len = len(data) + if self.remaining < data_len: + self.remaining = 0 + self.output += add_truncation_text(output, self.replacement) + raise self.TruncationCompleted + self.remaining -= data_len + self.output += output + + def feed(self, data): + try: + super().feed(data) + except self.TruncationCompleted: + self.output += "".join([f"</{tag}>" for tag in self.tags]) + self.tags.clear() + self.reset() + else: + # No data was handled. + self.reset() + + +class TruncateCharsHTMLParser(TruncateHTMLParser): + def __init__(self, *, length, replacement, convert_charrefs=True): + self.length = length + self.processed_chars = 0 + super().__init__( + length=calculate_truncate_chars_length(length, replacement), + replacement=replacement, + convert_charrefs=convert_charrefs, + ) + + def process(self, data): + self.processed_chars += len(data) + if (self.processed_chars == self.length) and ( + len(self.output) + len(data) == len(self.rawdata) + ): + self.output += data + raise self.TruncationCompleted + output = escape("".join(data[: self.remaining])) + return data, output + + +class TruncateWordsHTMLParser(TruncateHTMLParser): + def process(self, data): + data = re.split(r"(?<=\S)\s+(?=\S)", data) + output = escape(" ".join(data[: self.remaining])) + return data, output + + class Truncator(SimpleLazyObject): """ An object used to truncate text, either by characters or words. 
@@ -108,19 +211,16 @@ class Truncator(SimpleLazyObject): return "" text = unicodedata.normalize("NFC", self._wrapped) - # Calculate the length to truncate to (max length - end_text length) - truncate_len = length - for char in add_truncation_text("", truncate): - if not unicodedata.combining(char): - truncate_len -= 1 - if truncate_len == 0: - break if html: - return self._truncate_html(length, truncate, text, truncate_len, False) - return self._text_chars(length, truncate, text, truncate_len) + parser = TruncateCharsHTMLParser(length=length, replacement=truncate) + parser.feed(text) + parser.close() + return parser.output + return self._text_chars(length, truncate, text) - def _text_chars(self, length, truncate, text, truncate_len): + def _text_chars(self, length, truncate, text): """Truncate a string after a certain number of chars.""" + truncate_len = calculate_truncate_chars_length(length, truncate) s_len = 0 end_index = None for i, char in enumerate(text): @@ -149,7 +249,10 @@ class Truncator(SimpleLazyObject): if length <= 0: return "" if html: - return self._truncate_html(length, truncate, self._wrapped, length, True) + parser = TruncateWordsHTMLParser(length=length, replacement=truncate) + parser.feed(self._wrapped) + parser.close() + return parser.output return self._text_words(length, truncate) def _text_words(self, length, truncate): @@ -164,94 +267,6 @@ class Truncator(SimpleLazyObject): return add_truncation_text(" ".join(words), truncate) return " ".join(words) - def _truncate_html(self, length, truncate, text, truncate_len, words): - """ - Truncate HTML to a certain number of chars (not counting tags and - comments), or, if words is True, then to a certain number of words. - Close opened tags if they were correctly closed in the given HTML. - - Preserve newlines in the HTML. 
- """ - if words and length <= 0: - return "" - - size_limited = False - if len(text) > self.MAX_LENGTH_HTML: - text = text[: self.MAX_LENGTH_HTML] - size_limited = True - - html4_singlets = ( - "br", - "col", - "link", - "base", - "img", - "param", - "area", - "hr", - "input", - ) - - # Count non-HTML chars/words and keep note of open tags - pos = 0 - end_text_pos = 0 - current_len = 0 - open_tags = [] - - regex = re_words if words else re_chars - - while current_len <= length: - m = regex.search(text, pos) - if not m: - # Checked through whole string - break - pos = m.end(0) - if m[1]: - # It's an actual non-HTML word or char - current_len += 1 - if current_len == truncate_len: - end_text_pos = pos - continue - # Check for tag - tag = re_tag.match(m[0]) - if not tag or current_len >= truncate_len: - # Don't worry about non tags or tags after our truncate point - continue - closing_tag, tagname, self_closing = tag.groups() - # Element names are always case-insensitive - tagname = tagname.lower() - if self_closing or tagname in html4_singlets: - pass - elif closing_tag: - # Check for match in open tags list - try: - i = open_tags.index(tagname) - except ValueError: - pass - else: - # SGML: An end tag closes, back to the matching start tag, - # all unclosed intervening start tags with omitted end tags - open_tags = open_tags[i + 1 :] - else: - # Add it to the start of the open tags list - open_tags.insert(0, tagname) - - truncate_text = add_truncation_text("", truncate) - - if current_len <= length: - if size_limited and truncate_text: - text += truncate_text - return text - - out = text[:end_text_pos] - if truncate_text: - out += truncate_text - # Close any tags still open - for tag in open_tags: - out += "</%s>" % tag - # Return string - return out - @keep_lazy_text def get_valid_filename(name): |
