import re import unicodedata from html.parser import HTMLParser from pathlib import Path from django.conf import settings from django.http import Http404 def get_doc_root(lang, version, builder="json"): return settings.DOCS_BUILD_ROOT / lang / version / "_built" / builder def get_doc_root_or_404(lang, version, builder="json"): docroot = get_doc_root(lang, version, builder) if not docroot.exists(): raise Http404(str(docroot)) return docroot def get_doc_path(docroot, subpath): # First look for /index.fjson, then for .fjson try: bits = subpath.strip("/").split("/") + ["index.fjson"] except AttributeError: bits = [] doc = docroot / Path(*bits) try: if doc.exists(): return doc except NotADirectoryError: pass # we get here if doc + subpath (without /index.fjson) is a file bits = bits[:-2] + ["%s.fjson" % bits[-2]] doc = docroot / Path(*bits) if doc.exists(): return doc return None def get_doc_path_or_404(docroot, subpath): doc = get_doc_path(docroot, subpath) if doc is None: raise Http404(doc) return doc def sanitize_for_trigram(text): """ Sanitize search query for PostgreSQL Trigram search. - Removes parts starting with '-' - Normalizes Unicode characters (NFKD) - Keeps only letters, numbers and spaces - Removes multiple spaces and trims """ text = re.sub(r'(\s|^)-[^\s"\']+|(\s|^)-["\'][^"\']+["\']', "", text) text = unicodedata.normalize("NFKD", text) text = re.sub(r"[^\w\s]", "", text, flags=re.UNICODE) return " ".join(text.split()) def get_module_path(name, full_path): """ Checks if the `full_path` ends with `.name` and, if so, removes it to return the module path. Otherwise, it returns `None`. Args: name (str): The short name of the object (e.g., `"QuerySet.select_related"`). full_path (str): The full path of the object (e.g., `"django.db.models.query.QuerySet.select_related"`). Returns: str or None: The module path if `full_path` ends with `.name`, otherwise `None`. Example: >>> get_module_path( ... "QuerySet.select_related", ... "django.db.models.query.QuerySet.select_related" ... ) 'django.db.models.query' >>> get_module_path("Model", "django.db.models.Model") 'django.db.models' >>> get_module_path("django", "django") None """ name_suffix = f".{name}" if full_path.endswith(name_suffix): return full_path.removesuffix(name_suffix) return None class SingleTagInnerHTMLExtractor(HTMLParser): def __init__(self, target_tag): super().__init__() self.target_tag = target_tag.lower() self.capturing = False self.inner_html = [] self.tag_count = 0 def handle_starttag(self, tag, attrs): tag = tag.lower() if tag == self.target_tag: self.tag_count += 1 if self.capturing: # Nested target tag not allowed. raise ValueError(f"Nested <{self.target_tag}> tags are not allowed.") self.capturing = True elif self.capturing: self.inner_html.append(self.get_starttag_text()) def handle_endtag(self, tag): tag = tag.lower() if self.capturing: if tag == self.target_tag: self.capturing = False else: self.inner_html.append(f"") def handle_data(self, data): if self.capturing: self.inner_html.append(data) def handle_entityref(self, name): if self.capturing: self.inner_html.append(f"&{name};") def handle_charref(self, name): if self.capturing: self.inner_html.append(f"&#{name};") def extract_inner_html(html, tag): """ Extracts the inner HTML of a tag that appears exactly once. """ parser = SingleTagInnerHTMLExtractor(tag) parser.feed(html) parser.close() if parser.tag_count == 0: raise ValueError(f"<{tag}> not found in HTML.") if parser.tag_count > 1: raise ValueError(f"<{tag}> occurs more than once in HTML.") return "".join(parser.inner_html)