docs/utils.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152

import re
import unicodedata
from html.parser import HTMLParser
from pathlib import Path

from django.conf import settings
from django.http import Http404


def get_doc_root(lang, version, builder="json"):
    """
    Build the path of the built documentation for one language and version.

    The result is ``settings.DOCS_BUILD_ROOT/<lang>/<version>/_built/<builder>``.
    The path is only computed here; nothing checks that it exists.
    """
    build_root = settings.DOCS_BUILD_ROOT
    release_dir = build_root / lang / version
    return release_dir / "_built" / builder


def get_doc_root_or_404(lang, version, builder="json"):
    """
    Return the built documentation root, or raise ``Http404`` if it is missing.

    The path is computed by ``get_doc_root()``; when it does not exist on
    disk, ``Http404`` is raised with the path as its message.
    """
    root = get_doc_root(lang, version, builder)
    if root.exists():
        return root
    raise Http404(str(root))


def get_doc_path(docroot, subpath):
    """
    Locate the built ``.fjson`` file for ``subpath`` below ``docroot``.

    Two candidates are tried, in this order:

    1. ``<docroot>/<subpath>/index.fjson``
    2. ``<docroot>/<subpath>.fjson``

    Args:
        docroot (pathlib.Path):
            Directory containing the built documents.
        subpath (str):
            Slash-separated document path; leading and trailing slashes are
            ignored. A value without a ``strip()`` method (e.g. ``None``) is
            treated as "no subpath": ``docroot`` itself is then the only
            candidate.

    Returns:
        pathlib.Path or None:
            The first candidate that exists, otherwise ``None``.
    """
    # First look for <bits>/index.fjson, then for <bits>.fjson
    try:
        bits = subpath.strip("/").split("/") + ["index.fjson"]
    except AttributeError:
        bits = []
    doc = docroot / Path(*bits)
    try:
        if doc.exists():
            return doc
    except NotADirectoryError:
        pass  # we get here if doc + subpath (without /index.fjson) is a file

    if not bits:
        # Without a subpath there is no "<name>.fjson" fallback to build;
        # indexing bits[-2] below would raise IndexError.
        return None

    # Replace the trailing "<name>/index.fjson" pair with "<name>.fjson".
    bits = bits[:-2] + [f"{bits[-2]}.fjson"]
    doc = docroot / Path(*bits)
    if doc.exists():
        return doc

    return None


def get_doc_path_or_404(docroot, subpath):
    """
    Return the document path for ``subpath``, or raise ``Http404``.

    Args:
        docroot (pathlib.Path):
            Directory containing the built documents.
        subpath (str):
            Slash-separated document path, as accepted by ``get_doc_path()``.

    Returns:
        pathlib.Path: The path returned by ``get_doc_path()``.

    Raises:
        Http404: If ``get_doc_path()`` finds no matching document.
    """
    doc = get_doc_path(docroot, subpath)
    if doc is None:
        # `doc` is None here and carries no information, so report what was
        # actually requested instead.
        raise Http404(f"{subpath!r} not found in {docroot}")
    return doc


def sanitize_for_trigram(text):
    """
    Clean up a search query before it is used for PostgreSQL trigram search.

    The steps are applied in this order:

    - Drop every term prefixed with '-' (bare or wrapped in quotes) that
      starts the query or follows whitespace
    - Apply Unicode NFKD normalization
    - Strip every character that is neither a word character (letters,
      digits, underscore) nor whitespace; this also drops the combining
      marks produced by the normalization
    - Collapse runs of whitespace into single spaces and trim both ends
    """
    excluded_terms = re.compile(r'(\s|^)-[^\s"\']+|(\s|^)-["\'][^"\']+["\']')
    unwanted_chars = re.compile(r"[^\w\s]", flags=re.UNICODE)

    without_exclusions = excluded_terms.sub("", text)
    decomposed = unicodedata.normalize("NFKD", without_exclusions)
    words = unwanted_chars.sub("", decomposed).split()
    return " ".join(words)


def get_module_path(name, full_path):
    """
    Strip a trailing `.name` from `full_path` to obtain the module path.

    Args:
        name (str):
            Short name of the object (e.g., `"QuerySet.select_related"`).
        full_path (str):
            Fully qualified path of the object (e.g.,
            `"django.db.models.query.QuerySet.select_related"`).

    Returns:
        str or None:
            `full_path` without its trailing `.name` when it ends that way,
            otherwise `None`.

    Example:
        >>> get_module_path(
        ...   "QuerySet.select_related",
        ...   "django.db.models.query.QuerySet.select_related"
        ... )
        'django.db.models.query'

        >>> get_module_path("Model", "django.db.models.Model")
        'django.db.models'

        >>> get_module_path("django", "django") is None
        True
    """
    dotted_name = "." + name
    if not full_path.endswith(dotted_name):
        return None
    # dotted_name always contains at least the dot, so the slice end is
    # never -0.
    return full_path[: -len(dotted_name)]


class SingleTagInnerHTMLExtractor(HTMLParser):
    """
    HTML parser that collects the inner HTML of a single target tag.

    While the parser is between the start and end of the target tag, every
    start tag, end tag, text chunk and character/entity reference is appended
    to ``inner_html``. Comments, declarations and processing instructions are
    not captured (no handler is defined for them).

    Args:
        target_tag (str):
            Name of the tag whose content is collected (case-insensitive).

    Raises:
        ValueError: While feeding, if the target tag is opened again before
            it has been closed.
    """

    def __init__(self, target_tag):
        # convert_charrefs=False makes the parser report "&name;" / "&#n;"
        # through handle_entityref()/handle_charref() instead of decoding
        # them into handle_data(). Decoding would turn "&lt;" into a raw "<"
        # in the collected markup.
        super().__init__(convert_charrefs=False)
        self.target_tag = target_tag.lower()
        # True while the parser is inside the target tag.
        self.capturing = False
        # Collected markup fragments, in document order.
        self.inner_html = []
        # Number of target start tags seen so far.
        self.tag_count = 0

    def handle_starttag(self, tag, attrs):
        tag = tag.lower()
        if tag == self.target_tag:
            self.tag_count += 1
            if self.capturing:
                # Nested target tag not allowed.
                raise ValueError(f"Nested <{self.target_tag}> tags are not allowed.")
            self.capturing = True
        elif self.capturing:
            # Reuse the source text so attributes are kept exactly as written.
            self.inner_html.append(self.get_starttag_text())

    def handle_startendtag(self, tag, attrs):
        if tag.lower() == self.target_tag:
            # A self-closing target opens and closes immediately.
            self.handle_starttag(tag, attrs)
            self.handle_endtag(tag)
        elif self.capturing:
            # The source text already contains "/>"; the default
            # implementation would additionally emit a bogus "</tag>".
            self.inner_html.append(self.get_starttag_text())

    def handle_endtag(self, tag):
        tag = tag.lower()
        if self.capturing:
            if tag == self.target_tag:
                self.capturing = False
            else:
                self.inner_html.append(f"</{tag}>")

    def handle_data(self, data):
        if self.capturing:
            self.inner_html.append(data)

    def handle_entityref(self, name):
        if self.capturing:
            self.inner_html.append(f"&{name};")

    def handle_charref(self, name):
        if self.capturing:
            self.inner_html.append(f"&#{name};")


def extract_inner_html(html, tag):
    """
    Return the inner HTML of ``tag``, which must occur exactly once in ``html``.

    Raises:
        ValueError: If ``tag`` is absent, occurs more than once, or is
            nested inside itself (raised by the parser while feeding).
    """
    extractor = SingleTagInnerHTMLExtractor(tag)
    extractor.feed(html)
    extractor.close()

    occurrences = extractor.tag_count
    if occurrences > 1:
        raise ValueError(f"<{tag}> occurs more than once in HTML.")
    if occurrences == 0:
        raise ValueError(f"<{tag}> not found in HTML.")

    fragments = extractor.inner_html
    return "".join(fragments)