Fixed #30686 -- Used Python HTMLParser in utils.text.Truncator.

author: David Smith <smithdc@gmail.com> 2023-01-03 20:48:06 +0000
committer: Mariusz Felisiak <felisiak.mariusz@gmail.com> 2024-02-07 09:46:25 +0100
commit: 6ee37ada3241ed263d8d1c2901b030d964cbd161 (patch)
tree: b37d4c173f5a7621e9304055875eca8d1939a069
parent: 70f39e46f86b946c273340d52109824c776ffb4c (diff)
4 files changed, 149 insertions, 125 deletions
diff --git a/django/utils/text.py b/django/utils/text.py
index 374fd78f92..9560ebc678 100644
--- a/django/utils/text.py
+++ b/django/utils/text.py
@@ -2,12 +2,20 @@ import gzip
 import re
 import secrets
 import unicodedata
+from collections import deque
 from gzip import GzipFile
 from gzip import compress as gzip_compress
+from html import escape
+from html.parser import HTMLParser
 from io import BytesIO
 
 from django.core.exceptions import SuspiciousFileOperation
-from django.utils.functional import SimpleLazyObject, keep_lazy_text, lazy
+from django.utils.functional import (
+    SimpleLazyObject,
+    cached_property,
+    keep_lazy_text,
+    lazy,
+)
 from django.utils.regex_helper import _lazy_re_compile
 from django.utils.translation import gettext as _
 from django.utils.translation import gettext_lazy, pgettext
@@ -80,6 +88,101 @@ def add_truncation_text(text, truncate=None):
     return f"{text}{truncate}"
 
 
+def calculate_truncate_chars_length(length, replacement):
+    truncate_len = length
+    for char in add_truncation_text("", replacement):
+        if not unicodedata.combining(char):
+            truncate_len -= 1
+            if truncate_len == 0:
+                break
+    return truncate_len
+
+
+class TruncateHTMLParser(HTMLParser):
+    class TruncationCompleted(Exception):
+        pass
+
+    def __init__(self, *, length, replacement, convert_charrefs=True):
+        super().__init__(convert_charrefs=convert_charrefs)
+        self.tags = deque()
+        self.output = ""
+        self.remaining = length
+        self.replacement = replacement
+
+    @cached_property
+    def void_elements(self):
+        from django.utils.html import VOID_ELEMENTS
+
+        return VOID_ELEMENTS
+
+    def handle_startendtag(self, tag, attrs):
+        self.handle_starttag(tag, attrs)
+        if tag not in self.void_elements:
+            self.handle_endtag(tag)
+
+    def handle_starttag(self, tag, attrs):
+        self.output += self.get_starttag_text()
+        if tag not in self.void_elements:
+            self.tags.appendleft(tag)
+
+    def handle_endtag(self, tag):
+        if tag not in self.void_elements:
+            self.output += f"</{tag}>"
+            try:
+                self.tags.remove(tag)
+            except ValueError:
+                pass
+
+    def handle_data(self, data):
+        data, output = self.process(data)
+        data_len = len(data)
+        if self.remaining < data_len:
+            self.remaining = 0
+            self.output += add_truncation_text(output, self.replacement)
+            raise self.TruncationCompleted
+        self.remaining -= data_len
+        self.output += output
+
+    def feed(self, data):
+        try:
+            super().feed(data)
+        except self.TruncationCompleted:
+            self.output += "".join([f"</{tag}>" for tag in self.tags])
+            self.tags.clear()
+            self.reset()
+        else:
+            # Limit not reached: discard any input the parser still buffers.
+            self.reset()
+
+
+class TruncateCharsHTMLParser(TruncateHTMLParser):
+    def __init__(self, *, length, replacement, convert_charrefs=True):
+        self.length = length
+        self.processed_chars = 0
+        super().__init__(
+            length=calculate_truncate_chars_length(length, replacement),
+            replacement=replacement,
+            convert_charrefs=convert_charrefs,
+        )
+
+    def process(self, data):
+        self.processed_chars += len(data)
+        if (self.processed_chars == self.length) and (
+            len(self.output) + len(data) == len(self.rawdata)
+        ):
+            self.output += data
+            raise self.TruncationCompleted
+        output = escape("".join(data[: self.remaining]))
+        return data, output
+
+
+class TruncateWordsHTMLParser(TruncateHTMLParser):
+    def process(self, data):
+        data = re.split(r"(?<=\S)\s+(?=\S)", data)
+        output = escape(" ".join(data[: self.remaining]))
+        return data, output
+
+
 class Truncator(SimpleLazyObject):
     """
     An object used to truncate text, either by characters or words.
@@ -108,19 +211,16 @@ class Truncator(SimpleLazyObject):
             return ""
         text = unicodedata.normalize("NFC", self._wrapped)
 
-        # Calculate the length to truncate to (max length - end_text length)
-        truncate_len = length
-        for char in add_truncation_text("", truncate):
-            if not unicodedata.combining(char):
-                truncate_len -= 1
-                if truncate_len == 0:
-                    break
         if html:
-            return self._truncate_html(length, truncate, text, truncate_len, False)
-        return self._text_chars(length, truncate, text, truncate_len)
+            parser = TruncateCharsHTMLParser(length=length, replacement=truncate)
+            parser.feed(text)
+            parser.close()
+            return parser.output
+        return self._text_chars(length, truncate, text)
 
-    def _text_chars(self, length, truncate, text, truncate_len):
+    def _text_chars(self, length, truncate, text):
         """Truncate a string after a certain number of chars."""
+        truncate_len = calculate_truncate_chars_length(length, truncate)
         s_len = 0
         end_index = None
         for i, char in enumerate(text):
@@ -149,7 +249,10 @@ class Truncator(SimpleLazyObject):
         if length <= 0:
             return ""
         if html:
-            return self._truncate_html(length, truncate, self._wrapped, length, True)
+            parser = TruncateWordsHTMLParser(length=length, replacement=truncate)
+            parser.feed(self._wrapped)
+            parser.close()
+            return parser.output
         return self._text_words(length, truncate)
 
     def _text_words(self, length, truncate):
@@ -164,94 +267,6 @@ class Truncator(SimpleLazyObject):
             return add_truncation_text(" ".join(words), truncate)
         return " ".join(words)
 
-    def _truncate_html(self, length, truncate, text, truncate_len, words):
-        """
-        Truncate HTML to a certain number of chars (not counting tags and
-        comments), or, if words is True, then to a certain number of words.
-        Close opened tags if they were correctly closed in the given HTML.
-
-        Preserve newlines in the HTML.
-        """
-        if words and length <= 0:
-            return ""
-
-        size_limited = False
-        if len(text) > self.MAX_LENGTH_HTML:
-            text = text[: self.MAX_LENGTH_HTML]
-            size_limited = True
-
-        html4_singlets = (
-            "br",
-            "col",
-            "link",
-            "base",
-            "img",
-            "param",
-            "area",
-            "hr",
-            "input",
-        )
-
-        # Count non-HTML chars/words and keep note of open tags
-        pos = 0
-        end_text_pos = 0
-        current_len = 0
-        open_tags = []
-
-        regex = re_words if words else re_chars
-
-        while current_len <= length:
-            m = regex.search(text, pos)
-            if not m:
-                # Checked through whole string
-                break
-            pos = m.end(0)
-            if m[1]:
-                # It's an actual non-HTML word or char
-                current_len += 1
-                if current_len == truncate_len:
-                    end_text_pos = pos
-                continue
-            # Check for tag
-            tag = re_tag.match(m[0])
-            if not tag or current_len >= truncate_len:
-                # Don't worry about non tags or tags after our truncate point
-                continue
-            closing_tag, tagname, self_closing = tag.groups()
-            # Element names are always case-insensitive
-            tagname = tagname.lower()
-            if self_closing or tagname in html4_singlets:
-                pass
-            elif closing_tag:
-                # Check for match in open tags list
-                try:
-                    i = open_tags.index(tagname)
-                except ValueError:
-                    pass
-                else:
-                    # SGML: An end tag closes, back to the matching start tag,
-                    # all unclosed intervening start tags with omitted end tags
-                    open_tags = open_tags[i + 1 :]
-            else:
-                # Add it to the start of the open tags list
-                open_tags.insert(0, tagname)
-
-        truncate_text = add_truncation_text("", truncate)
-
-        if current_len <= length:
-            if size_limited and truncate_text:
-                text += truncate_text
-            return text
-
-        out = text[:end_text_pos]
-        if truncate_text:
-            out += truncate_text
-        # Close any tags still open
-        for tag in open_tags:
-            out += "</%s>" % tag
-        # Return string
-        return out
-
 
 @keep_lazy_text
 def get_valid_filename(name):
diff --git a/docs/releases/5.1.txt b/docs/releases/5.1.txt
index 701d686532..aca1281a98 100644
--- a/docs/releases/5.1.txt
+++ b/docs/releases/5.1.txt
@@ -368,6 +368,11 @@ Miscellaneous
   :meth:`~django.test.SimpleTestCase.assertInHTML` now add ``": "`` to the
   ``msg_prefix``. This is consistent with the behavior of other assertions.
 
+* ``django.utils.text.Truncator``, used by the :tfilter:`truncatechars_html`
+  and :tfilter:`truncatewords_html` template filters, now uses
+  :py:class:`html.parser.HTMLParser` subclasses. This makes truncation more
+  robust and faster, but there may be small differences in the output.
+
 .. _deprecated-features-5.1:
 
 Features deprecated in 5.1
diff --git a/tests/template_tests/filter_tests/test_truncatewords_html.py b/tests/template_tests/filter_tests/test_truncatewords_html.py
index 32b7c81a76..0cf41d83ae 100644
--- a/tests/template_tests/filter_tests/test_truncatewords_html.py
+++ b/tests/template_tests/filter_tests/test_truncatewords_html.py
@@ -24,7 +24,7 @@ class FunctionTests(SimpleTestCase):
             truncatewords_html(
                 '<p>one <a href="#">two - three <br>four</a> five</p>', 4
             ),
-            '<p>one <a href="#">two - three …</a></p>',
+            '<p>one <a href="#">two - three <br> …</a></p>',
         )
 
     def test_truncate3(self):
@@ -32,7 +32,7 @@ class FunctionTests(SimpleTestCase):
             truncatewords_html(
                 '<p>one <a href="#">two - three <br>four</a> five</p>', 5
             ),
-            '<p>one <a href="#">two - three <br>four …</a></p>',
+            '<p>one <a href="#">two - three <br>four</a> …</p>',
         )
 
     def test_truncate4(self):
@@ -53,7 +53,7 @@ class FunctionTests(SimpleTestCase):
             truncatewords_html(
                 "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>", 3
             ),
-            "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo …</i>",
+            "<i>Buenos días! ¿Cómo …</i>",
         )
 
     def test_invalid_arg(self):
diff --git a/tests/utils_tests/test_text.py b/tests/utils_tests/test_text.py
index 6004712bf2..b38d8238c5 100644
--- a/tests/utils_tests/test_text.py
+++ b/tests/utils_tests/test_text.py
@@ -111,7 +111,7 @@ class TestUtilsText(SimpleTestCase):
             truncator.chars(46, html=True),
         )
         self.assertEqual(
-            '<p id="par"><strong><em>The quick brown fox jumped over the lazy dog.</em>'
+            '<p id="par"><strong><em>The quick brown fox jumped over the lazy dog…</em>'
             "</strong></p>",
             truncator.chars(45, html=True),
         )
@@ -120,7 +120,7 @@ class TestUtilsText(SimpleTestCase):
             truncator.chars(10, html=True),
         )
         self.assertEqual(
-            "…",
+            '<p id="par"><strong><em>…</em></strong></p>',
             truncator.chars(1, html=True),
         )
         self.assertEqual("", truncator.chars(0, html=True))
@@ -142,18 +142,16 @@ class TestUtilsText(SimpleTestCase):
         bigger_len = text.Truncator.MAX_LENGTH_HTML + 1
         valid_html = "<p>Joel is a slug</p>"  # 14 chars
         perf_test_values = [
-            ("</a" + "\t" * (max_len - 6) + "//>", None),
-            ("</p" + "\t" * bigger_len + "//>", "</p" + "\t" * 6 + "…"),
-            ("&" * bigger_len, "&" * 9 + "…"),
-            ("_X<<<<<<<<<<<>", None),
+            ("</a" + "\t" * (max_len - 6) + "//>", "</a>"),
+            ("</p" + "\t" * bigger_len + "//>", "</p>"),
+            ("&" * bigger_len, ""),
+            ("_X<<<<<<<<<<<>", "_X&lt;&lt;&lt;&lt;&lt;&lt;&lt;…"),
             (valid_html * bigger_len, "<p>Joel is a…</p>"),  # 10 chars
         ]
         for value, expected in perf_test_values:
             with self.subTest(value=value):
                 truncator = text.Truncator(value)
-                self.assertEqual(
-                    expected if expected else value, truncator.chars(10, html=True)
-                )
+                self.assertEqual(expected, truncator.chars(10, html=True))
 
     def test_truncate_chars_html_with_newline_inside_tag(self):
         truncator = text.Truncator(
@@ -181,7 +179,7 @@ class TestUtilsText(SimpleTestCase):
             "<br>The <hr/>quick <em>brown…</em>", truncator.chars(16, html=True)
         )
         self.assertEqual("<br>The <hr/>q…", truncator.chars(6, html=True))
-        self.assertEqual("<br>The …", truncator.chars(5, html=True))
+        self.assertEqual("<br>The <hr/>…", truncator.chars(5, html=True))
         self.assertEqual("<br>The…", truncator.chars(4, html=True))
         self.assertEqual("<br>Th…", truncator.chars(3, html=True))
 
@@ -190,11 +188,19 @@ class TestUtilsText(SimpleTestCase):
             "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>"
         )
         self.assertEqual(
-            "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo…</i>",
+            "<i>Buenos días! ¿Cómo está?</i>",
             truncator.chars(40, html=True),
         )
+        self.assertEqual(
+            "<i>Buenos días…</i>",
+            truncator.chars(12, html=True),
+        )
+        self.assertEqual(
+            "<i>Buenos días! ¿Cómo está…</i>",
+            truncator.chars(24, html=True),
+        )
         truncator = text.Truncator("<p>I &lt;3 python, what about you?</p>")
-        self.assertEqual("<p>I &lt;3 python,…</p>", truncator.chars(16, html=True))
+        self.assertEqual("<p>I &lt;3 python, wh…</p>", truncator.chars(16, html=True))
 
     def test_truncate_words(self):
         truncator = text.Truncator("The quick brown fox jumped over the lazy dog.")
@@ -242,7 +248,7 @@ class TestUtilsText(SimpleTestCase):
             "<p>The  quick \t brown fox jumped over the lazy dog.</p>"
         )
         self.assertEqual(
-            "<p>The  quick \t brown fox…</p>",
+            "<p>The quick brown fox…</p>",
             truncator.words(4, html=True),
         )
 
@@ -277,7 +283,7 @@ class TestUtilsText(SimpleTestCase):
             "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo est&aacute;?</i>"
         )
         self.assertEqual(
-            "<i>Buenos d&iacute;as! &#x00bf;C&oacute;mo…</i>",
+            "<i>Buenos días! ¿Cómo…</i>",
             truncator.words(3, html=True),
         )
         truncator = text.Truncator("<p>I &lt;3 python, what about you?</p>")
@@ -292,19 +298,17 @@ class TestUtilsText(SimpleTestCase):
         bigger_len = text.Truncator.MAX_LENGTH_HTML + 1
         valid_html = "<p>Joel is a slug</p>"  # 4 words
         perf_test_values = [
-            ("</a" + "\t" * (max_len - 6) + "//>", None),
-            ("</p" + "\t" * bigger_len + "//>", "</p" + "\t" * (max_len - 3) + "…"),
-            ("&" * max_len, None),  # no change
-            ("&" * bigger_len, "&" * max_len + "…"),
-            ("_X<<<<<<<<<<<>", None),
+            ("</a" + "\t" * (max_len - 6) + "//>", "</a>"),
+            ("</p" + "\t" * bigger_len + "//>", "</p>"),
+            ("&" * max_len, ""),
+            ("&" * bigger_len, ""),
+            ("_X<<<<<<<<<<<>", "_X&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&lt;&gt;"),
             (valid_html * bigger_len, valid_html * 12 + "<p>Joel is…</p>"),  # 50 words
         ]
         for value, expected in perf_test_values:
             with self.subTest(value=value):
                 truncator = text.Truncator(value)
-                self.assertEqual(
-                    expected if expected else value, truncator.words(50, html=True)
-                )
+                self.assertEqual(expected, truncator.words(50, html=True))
 
     def test_wrap(self):
         digits = "1234 67 9"
author	David Smith <smithdc@gmail.com>	2023-01-03 20:48:06 +0000
committer	Mariusz Felisiak <felisiak.mariusz@gmail.com>	2024-02-07 09:46:25 +0100
commit	6ee37ada3241ed263d8d1c2901b030d964cbd161 (patch)
tree	b37d4c173f5a7621e9304055875eca8d1939a069
parent	70f39e46f86b946c273340d52109824c776ffb4c (diff)