[3.2.x] Fixed CVE-2024-27351 -- Prevented potential ReDoS in Truncator.words().

Thanks Seokchan Yoon for the report. Co-Authored-By: Mariusz Felisiak <felisiak.mariusz@gmail.com>
author: Shai Berger <shai@platonix.com> 2024-02-19 13:56:37 +0100
committer: Mariusz Felisiak <felisiak.mariusz@gmail.com> 2024-03-04 08:37:38 +0100
commit: 072963e4c4d0b3a7a8c5412bc0c7d27d1a9c3521 (patch)
tree: e3f4ce3b321797535984d8d75c6bb82f432bfdc9
parent: 2ad2676456316eb211104d1f0cfc8dea7a7ca76b (diff)
3 files changed, 89 insertions, 2 deletions
diff --git a/django/utils/text.py b/django/utils/text.py
index 83e258fa81..88da9a2c2c 100644
--- a/django/utils/text.py
+++ b/django/utils/text.py
@@ -18,8 +18,61 @@ def capfirst(x):
     return x and str(x)[0].upper() + str(x)[1:]
 
 
-# Set up regular expressions
-re_words = _lazy_re_compile(r'<[^>]+?>|([^<>\s]+)', re.S)
+# ----- Begin security-related performance workaround -----
+
+# We used to have, below
+#
+# re_words = _lazy_re_compile(r"<[^>]+?>|([^<>\s]+)", re.S)
+#
+# But it was shown that this regex, in the way we use it here, has
+# catastrophic performance in some edge cases, namely when it is applied to
+# text with only open brackets "<<<...". The class below provides the same
+# services and correct answers for our use cases, but handles these edge
+# cases much faster.
+re_notag = _lazy_re_compile(r"([^<>\s]+)", re.S)  # a word (group 1) only
+re_prt = _lazy_re_compile(r"<|([^<>\s]+)", re.S)  # "<" or a word (group 1)
+
+
+class WordsRegex:  # offers search(text, pos) in place of a compiled pattern
+    @staticmethod
+    def search(text, pos):
+        # Find the next "<" or the next word (captured as group 1).
+        partial = re_prt.search(text, pos)
+        if partial is None or partial[1] is not None:
+            return partial
+
+        # Group 1 is empty, so "<" was matched; look for a closing ">".
+        end = text.find(">", partial.end(0))
+        if end < 0:
+            # No ">" after the "<": search for a word from pos + 1 instead.
+            return re_notag.search(text, pos + 1)
+        else:
+            # A ">" follows the "<" -- fake a match covering "<" through ">".
+            end += 1
+            return FakeMatch(text[partial.start(0): end], end)
+
+
+class FakeMatch:  # minimal stand-in for re.Match: only end(0), [0] and [1]
+    __slots__ = ["_text", "_end"]
+
+    def end(self, group=0):
+        assert group == 0, "This specific object takes only group=0"
+        return self._end  # the end index supplied at construction
+
+    def __getitem__(self, group):
+        if group == 1:
+            return None  # group 1 is the "word" group; a tag match has none
+        assert group == 0, "This specific object takes only group in {0,1}"
+        return self._text  # group 0: the whole matched text
+
+    def __init__(self, text, end):
+        self._text, self._end = text, end
+
+
+# ----- End security-related performance workaround -----
+
+# Set up regular expressions.
+re_words = WordsRegex  # not a compiled pattern; provides search(text, pos)
 re_chars = _lazy_re_compile(r'<[^>]+?>|(.)', re.S)
 re_tag = _lazy_re_compile(r'<(/)?(\S+?)(?:(\s*/)|\s.*?)?>', re.S)
 re_newlines = _lazy_re_compile(r'\r\n|\r')  # Used in normalize_newlines
diff --git a/docs/releases/3.2.25.txt b/docs/releases/3.2.25.txt
index aa81c720d5..a3a90986ff 100644
--- a/docs/releases/3.2.25.txt
+++ b/docs/releases/3.2.25.txt
@@ -7,6 +7,14 @@ Django 3.2.25 release notes
 Django 3.2.25 fixes a security issue with severity "moderate" and a regression
 in 3.2.24.
 
+CVE-2024-27351: Potential regular expression denial-of-service in ``django.utils.text.Truncator.words()``
+=========================================================================================================
+
+The ``django.utils.text.Truncator.words()`` method (with ``html=True``) and
+the :tfilter:`truncatewords_html` template filter were subject to a potential
+regular expression denial-of-service attack using a suitably crafted string
+(follow-up to :cve:`2019-14232` and :cve:`2023-43665`).
+
 Bugfixes
 ========
 
diff --git a/tests/utils_tests/test_text.py b/tests/utils_tests/test_text.py
index 0a6f0bc3f2..758919c66e 100644
--- a/tests/utils_tests/test_text.py
+++ b/tests/utils_tests/test_text.py
@@ -159,6 +159,32 @@ class TestUtilsText(SimpleTestCase):
         truncator = text.Truncator('<p>I &lt;3 python, what about you?</p>')
         self.assertEqual('<p>I &lt;3 python,…</p>', truncator.words(3, html=True))
 
+        # Only open brackets.
+        test = "<" * 60_000
+        truncator = text.Truncator(test)
+        self.assertEqual(truncator.words(1, html=True), test)
+
+        # Tags with special chars in attrs.
+        truncator = text.Truncator(
+            """<i style="margin: 5%; font: *;">Hello, my dear lady!</i>"""
+        )
+        self.assertEqual(
+            """<i style="margin: 5%; font: *;">Hello, my dear…</i>""",
+            truncator.words(3, html=True),
+        )
+
+        # Tags with special non-latin chars in attrs.
+        truncator = text.Truncator("""<p data-x="א">Hello, my dear lady!</p>""")
+        self.assertEqual(
+            """<p data-x="א">Hello, my dear…</p>""",
+            truncator.words(3, html=True),
+        )
+
+        # Misplaced brackets.
+        truncator = text.Truncator("hello >< world")
+        self.assertEqual(truncator.words(1, html=True), "hello…")
+        self.assertEqual(truncator.words(2, html=True), "hello >< world")
+
     @patch("django.utils.text.Truncator.MAX_LENGTH_HTML", 10_000)
     def test_truncate_words_html_size_limit(self):
         max_len = text.Truncator.MAX_LENGTH_HTML
author	Shai Berger <shai@platonix.com>	2024-02-19 13:56:37 +0100
committer	Mariusz Felisiak <felisiak.mariusz@gmail.com>	2024-03-04 08:37:38 +0100
commit	072963e4c4d0b3a7a8c5412bc0c7d27d1a9c3521 (patch)
tree	e3f4ce3b321797535984d8d75c6bb82f432bfdc9
parent	2ad2676456316eb211104d1f0cfc8dea7a7ca76b (diff)