summary | refs | log | tree | commit | diff
path: root/docs
diff options
context:
space:
mode:
author: Sarah Boyce <42296566+sarahboyce@users.noreply.github.com> 2026-03-23 14:32:13 +0100
committer: GitHub <noreply@github.com> 2026-03-23 14:32:13 +0100
commit: 004b788e60093488ea396d9615007b83faecd39a (patch)
tree: 4ea781073340e606c98019d6a07f2a06ede19016 /docs
parent: f61fb79492a77be93cc88923c12478614e06e004 (diff)
Synced webpages from sitemap to Document model for search. (#2284)
* Synced webpages from sitemap to Document model for search.
* Removed is_searchable from blog Entry model.
Diffstat (limited to 'docs')
-rw-r--r-- docs/management/commands/update_docs.py 2
-rw-r--r-- docs/models.py 112
-rw-r--r-- docs/search.py 40
-rw-r--r-- docs/sitemaps.py 23
-rw-r--r-- docs/tests/test_models.py 198
-rw-r--r-- docs/tests/test_utils.py 38
-rw-r--r-- docs/utils.py 57
7 files changed, 321 insertions, 149 deletions
diff --git a/docs/management/commands/update_docs.py b/docs/management/commands/update_docs.py
index acc0069b..b5b01131 100644
--- a/docs/management/commands/update_docs.py
+++ b/docs/management/commands/update_docs.py
@@ -134,6 +134,8 @@ class Command(BaseCommand):
if self.verbosity >= 1:
self.stdout.write(f"Starting update for {release} at {datetime.now()}...")
+ release.sync_from_sitemap(force=force)
+
# checkout_dir is shared for all languages.
checkout_dir = settings.DOCS_BUILD_ROOT / "sources" / release.version
parent_build_dir = settings.DOCS_BUILD_ROOT / release.lang / release.version
diff --git a/docs/models.py b/docs/models.py
index b5ad2870..8dd7268c 100644
--- a/docs/models.py
+++ b/docs/models.py
@@ -1,6 +1,7 @@
import datetime
import html
import json
+import logging
import operator
from functools import partial, reduce
from pathlib import Path
@@ -26,19 +27,21 @@ from django.utils.functional import cached_property
from django.utils.html import strip_tags
from django_hosts.resolvers import reverse
-from blog.models import Entry
from releases.models import Release
from . import utils
from .search import (
DEFAULT_TEXT_SEARCH_CONFIG,
- SEARCHABLE_VIEWS,
START_SEL,
STOP_SEL,
TSEARCH_CONFIG_LANGUAGES,
DocumentationCategory,
+ fetch_html,
get_document_search_vector,
)
+from .utils import extract_inner_html
+
+logger = logging.getLogger(__name__)
def get_search_config(lang):
@@ -185,7 +188,7 @@ class DocumentRelease(models.Model):
the database. Deletes all the release's documents first then
reinserts them as needed.
"""
- self.documents.all().delete()
+ self.documents.exclude(metadata__parents=DocumentationCategory.WEBSITE).delete()
# Read excluded paths from robots.docs.txt.
robots_path = settings.BASE_DIR / "djangoproject" / "static" / "robots.docs.txt"
@@ -216,65 +219,52 @@ class DocumentRelease(models.Model):
metadata=document,
config=get_search_config(self.lang),
)
- for document in self.documents.all():
+ for document in self.documents.exclude(
+ metadata__parents=DocumentationCategory.WEBSITE
+ ):
document.metadata["breadcrumbs"] = list(
Document.objects.breadcrumbs(document).values("title", "path")
)
document.save(update_fields=("metadata",))
- self._sync_blog_to_db()
- self._sync_views_to_db()
+ def sync_from_sitemap(self, force=False):
+ from djangoproject.urls.www import sitemaps
- def _sync_blog_to_db(self):
- """
- Sync the blog entries into search based on the release documents
- support end date.
- """
- if self.lang != "en":
- return # The blog is only written in English currently
+ if not self.is_dev:
+ return
- entries = Entry.objects.published().searchable()
- Document.objects.bulk_create(
- [
- Document(
- release=self,
- path=entry.get_absolute_url(),
- title=entry.headline,
- metadata={
- "body": entry.body_html,
- "breadcrumbs": [
- {
- "path": DocumentationCategory.WEBSITE,
- "title": "News",
- },
- ],
- "parents": DocumentationCategory.WEBSITE,
- "slug": entry.slug,
- "title": entry.headline,
- "toc": "",
- },
- config=get_search_config(self.lang),
- )
- for entry in entries
- ]
- )
+ if force:
+ Document.objects.filter(
+ metadata__parents=DocumentationCategory.WEBSITE
+ ).delete()
- def _sync_views_to_db(self):
- """
- Sync the specific views into search based on the release documents
- support end date.
- """
- if self.lang != "en":
- return # The searchable views are only written in English currently
+ doc_urls = set(
+ Document.objects.filter(
+ metadata__parents=DocumentationCategory.WEBSITE
+ ).values_list("path", flat=True)
+ )
- Document.objects.bulk_create(
- [
- Document(
+ for sitemap in sitemaps.values():
+ for url in sitemap().get_urls():
+ path = url["location"]
+ if path in doc_urls:
+ continue
+ try:
+ page_html = fetch_html(path)
+ except ValueError:
+ logger.exception("Error indexing template view for search")
+ continue
+ try:
+ main_html = extract_inner_html(page_html, tag="main")
+ title = extract_inner_html(page_html, tag="h1")
+ except ValueError:
+ continue
+ Document.objects.create(
release=self,
- path=searchable_view.www_absolute_url,
- title=searchable_view.page_title,
+ path=path,
+ title=title,
metadata={
- "body": searchable_view.html,
+ "body": main_html,
"breadcrumbs": [
{
"path": DocumentationCategory.WEBSITE,
@@ -282,15 +272,11 @@ class DocumentRelease(models.Model):
},
],
"parents": DocumentationCategory.WEBSITE,
- "slug": searchable_view.url_name,
- "title": searchable_view.page_title,
+ "title": title,
"toc": "",
},
config=get_search_config(self.lang),
)
- for searchable_view in SEARCHABLE_VIEWS
- ]
- )
def _clean_document_path(path):
@@ -351,6 +337,20 @@ class DocumentQuerySet(models.QuerySet):
config=models.F("config"),
)
base_filter = Q(release_id=release.id)
+ if release.lang == settings.DEFAULT_LANGUAGE_CODE and not release.is_dev:
+ # Fetch the "dev" release explicitly so we can filter by release_id.
+ # This avoids JOINs in the main query and leverages the indexed FK,
+ # which is more efficient than filtering by release__release__version
+ # and release__lang.
+ dev_release = DocumentRelease.objects.get_by_version_and_lang(
+ version="dev", lang=settings.DEFAULT_LANGUAGE_CODE
+ )
+ # Website content (non-docs content) is associated with the "dev"
+ # release. This is included in the search results.
+ base_filter |= Q(
+ release_id=dev_release.id,
+ metadata__parents=DocumentationCategory.WEBSITE,
+ )
if document_category:
base_filter &= Q(metadata__parents__startswith=document_category)
base_qs = (
diff --git a/docs/search.py b/docs/search.py
index 0b7eaef1..4a779a75 100644
--- a/docs/search.py
+++ b/docs/search.py
@@ -1,11 +1,10 @@
-from dataclasses import dataclass
+from urllib.parse import urlparse
from django.contrib.postgres.search import SearchVector
from django.db.models import TextChoices
from django.db.models.fields.json import KeyTextTransform
-from django.template.loader import get_template
+from django.test import Client
from django.utils.translation import gettext_lazy as _
-from django_hosts import reverse
# Imported from
# https://github.com/postgres/postgres/blob/REL_14_STABLE/src/bin/initdb/initdb.c#L659
@@ -81,25 +80,16 @@ class DocumentationCategory(TextChoices):
return None
-@dataclass
-class SearchableView:
- page_title: str
- url_name: str
- template: str
-
- @property
- def html(self):
- return get_template(self.template).render()
-
- @property
- def www_absolute_url(self):
- return reverse(self.url_name, host="www")
-
-
-SEARCHABLE_VIEWS = [
- SearchableView(
- page_title="Django's Ecosystem",
- url_name="community-ecosystem",
- template="aggregator/ecosystem.html",
- ),
-]
+def fetch_html(url):
+ parsed = urlparse(url)
+ headers = {"HOST": parsed.netloc} # Use netloc to include the port.
+ client = Client(headers=headers, raise_request_exception=False)
+ response = client.get(parsed.path)
+ content_type = response.headers.get("Content-Type", "")
+ if response.status_code == 200 and "text/html" in content_type:
+ return response.text
+ raise ValueError(
+ f"Failed to fetch {url}, "
+ f"status code: {response.status_code}, "
+ f"Content-Type: {content_type}"
+ )
diff --git a/docs/sitemaps.py b/docs/sitemaps.py
index 6f520aa7..c46ec093 100644
--- a/docs/sitemaps.py
+++ b/docs/sitemaps.py
@@ -1,10 +1,12 @@
from django.contrib.sitemaps import Sitemap
+from djangoproject.sitemaps import LocationAbsoluteUrlMixin
+
from .models import Document
from .search import DocumentationCategory
-class DocsSitemap(Sitemap):
+class DocsSitemap(LocationAbsoluteUrlMixin, Sitemap):
def __init__(self, lang):
self.lang = lang
@@ -16,6 +18,9 @@ class DocsSitemap(Sitemap):
.select_related("release__release")
)
+ def location(self, item):
+ return item.get_absolute_url()
+
def changefreq(self, obj):
return "daily"
@@ -33,19 +38,3 @@ class DocsSitemap(Sitemap):
return 1
else:
return 0.1
-
- def _urls(self, page, site, protocol):
- # XXX: To workaround bad interaction between contrib.sitemaps and
- # django-hosts (scheme/domain would be repeated twice in URLs)
- urls = []
- for item in self.paginator.page(page).object_list:
- loc = item.get_absolute_url()
- priority = self.priority(item)
- url_info = {
- "item": item,
- "location": loc,
- "changefreq": self.changefreq(item),
- "priority": str(priority if priority is not None else ""),
- }
- urls.append(url_info)
- return urls
diff --git a/docs/tests/test_models.py b/docs/tests/test_models.py
index a5e22065..1ad94281 100644
--- a/docs/tests/test_models.py
+++ b/docs/tests/test_models.py
@@ -5,9 +5,8 @@ from django.conf import settings
from django.db import connection
from django.test import TestCase
from django.utils import timezone
-from django_hosts import reverse
-from blog.models import Entry
+from blog.models import ContentFormat, Entry
from releases.models import Release
from ..models import Document, DocumentRelease
@@ -184,6 +183,9 @@ class ManagerTests(TestCase):
class DocumentManagerTest(TestCase):
@classmethod
def setUpTestData(cls):
+ cls.dev_release = DocumentRelease.objects.create(
+ lang=settings.DEFAULT_LANGUAGE_CODE
+ )
cls.release = DocumentRelease.objects.create(
release=Release.objects.create(version="1.2.3"),
)
@@ -358,6 +360,20 @@ class DocumentManagerTest(TestCase):
"release": cls.release_fr,
"title": "Notes de publication de Django 1.9.4",
},
+ {
+ "metadata": {
+ "body": "Main 1",
+ "breadcrumbs": [
+ {"path": DocumentationCategory.WEBSITE, "title": "Website"}
+ ],
+ "parents": DocumentationCategory.WEBSITE,
+ "title": "Title 1",
+ "toc": "",
+ },
+ "path": "example",
+ "release": cls.dev_release,
+ "title": "Blog post",
+ },
]
Document.objects.bulk_create(Document(**doc) for doc in documents)
@@ -457,28 +473,21 @@ class DocumentManagerTest(TestCase):
),
)
+ def test_website_document_items_included_english(self):
+ self.assertQuerySetEqual(
+ Document.objects.search("Main", self.release),
+ ["Blog post"],
+ transform=attrgetter("title"),
+ )
+
+ def test_website_document_items_excluded_non_english(self):
+ self.assertEqual(Document.objects.search("Main", self.release_fr).count(), 0)
+
class UpdateDocTests(TestCase):
@classmethod
def setUpTestData(cls):
- now = timezone.now()
- cls.release = DocumentRelease.objects.create(
- release=Release.objects.create(
- version="1.0.0",
- eol_date=now + datetime.timedelta(days=1),
- )
- )
- cls.entry = Entry.objects.create(
- pub_date=now,
- is_active=True,
- is_searchable=True,
- headline="Searchable post",
- slug="a",
- body_html="<h1>Searchable Blog Post</h1>",
- )
- cls.docs_documents = cls.release.documents.exclude(
- metadata__parents=DocumentationCategory.WEBSITE
- )
+ cls.release = DocumentRelease.objects.create(is_default=True)
def test_sync_to_db(self):
self.release.sync_to_db(
@@ -490,24 +499,8 @@ class UpdateDocTests(TestCase):
}
]
)
- self.assertQuerySetEqual(
- self.release.documents.all(),
- [
- "foo/bar",
- reverse("community-ecosystem", host="www"),
- self.entry.get_absolute_url(),
- ],
- ordered=False,
- transform=attrgetter("path"),
- )
-
- def test_sync_to_db_skip_non_english(self):
- """
- Releases must be English to include the blog and website results in search.
- """
- non_english = DocumentRelease.objects.create(lang="es")
- non_english.sync_to_db([])
- self.assertQuerySetEqual(non_english.documents.all(), [])
+ document = self.release.documents.get()
+ self.assertEqual(document.path, "foo/bar")
def test_clean_path(self):
self.release.sync_to_db(
@@ -519,7 +512,7 @@ class UpdateDocTests(TestCase):
}
]
)
- document = self.docs_documents.get()
+ document = self.release.documents.get()
self.assertEqual(document.path, "foo/bar")
def test_title_strip_tags(self):
@@ -533,7 +526,7 @@ class UpdateDocTests(TestCase):
]
)
self.assertQuerySetEqual(
- self.docs_documents.all(),
+ self.release.documents.all(),
["This is the title"],
transform=attrgetter("title"),
)
@@ -549,7 +542,7 @@ class UpdateDocTests(TestCase):
]
)
self.assertQuerySetEqual(
- self.docs_documents,
+ self.release.documents.all(),
["Title & title"],
transform=attrgetter("title"),
)
@@ -562,7 +555,7 @@ class UpdateDocTests(TestCase):
{"current_page_name": "foo/3"},
]
)
- self.assertQuerySetEqual(self.docs_documents, [])
+ self.assertQuerySetEqual(self.release.documents.all(), [])
def test_excluded_documents(self):
"""
@@ -590,6 +583,119 @@ class UpdateDocTests(TestCase):
document = release.documents.get()
self.assertEqual(document.path, "nonexcluded/bar")
+ def test_sync_to_db_not_delete_website_docs(self):
+ Document.objects.create(
+ release=self.release,
+ path="example_path",
+ title="Title 1",
+ metadata={
+ "body": "Main 1",
+ "breadcrumbs": [
+ {"path": DocumentationCategory.WEBSITE, "title": "Website"}
+ ],
+ "parents": DocumentationCategory.WEBSITE,
+ "title": "Title 1",
+ "toc": "",
+ },
+ )
+ self.release.sync_to_db([])
+ self.assertEqual(Document.objects.filter(release=self.release).count(), 1)
+
+ def test_sync_from_sitemap_skip_non_en_dev_release(self):
+ release = Release.objects.create(version="5.2")
+ blog_entry = Entry.objects.create(
+ pub_date=timezone.now() - datetime.timedelta(days=2),
+ slug="a",
+ body="<strong>test</strong>",
+ content_format=ContentFormat.HTML,
+ is_active=True,
+ )
+ for lang, release_obj in [
+ ("fr", release),
+ (settings.DEFAULT_LANGUAGE_CODE, release),
+ ]:
+ doc_release = DocumentRelease.objects.create(
+ lang=lang,
+ release=release_obj,
+ )
+ with self.subTest(lang=lang, release=release_obj):
+ doc_release.sync_from_sitemap()
+ self.assertFalse(
+ Document.objects.filter(path=blog_entry.get_absolute_url()).exists()
+ )
+
+ def test_sync_from_sitemap(self):
+ blog_entry = Entry.objects.create(
+ pub_date=timezone.now() - datetime.timedelta(days=2),
+ slug="a",
+ body="<strong>test</strong>",
+ headline="Title 1",
+ content_format=ContentFormat.HTML,
+ is_active=True,
+ )
+ self.release.sync_from_sitemap()
+
+ document = Document.objects.get(
+ release=self.release, path=blog_entry.get_absolute_url()
+ )
+ self.assertEqual(document.title, "Title 1")
+ self.assertIn("<strong>test</strong>", document.metadata["body"])
+ self.assertEqual(document.metadata["title"], "Title 1")
+ self.assertEqual(document.metadata["toc"], "")
+
+ def test_sync_from_sitemap_only_requests_non_existing(self):
+ blog_entry = Entry.objects.create(
+ pub_date=timezone.now() - datetime.timedelta(days=2),
+ slug="a",
+ body="<strong>test</strong>",
+ content_format=ContentFormat.HTML,
+ is_active=True,
+ )
+ Document.objects.create(
+ release=self.release,
+ metadata={"parents": DocumentationCategory.WEBSITE},
+ path=blog_entry.get_absolute_url(),
+ )
+ self.release.sync_from_sitemap()
+ document = Document.objects.get(
+ release=self.release, path=blog_entry.get_absolute_url()
+ )
+ # Confirm Document has not been updated.
+ self.assertEqual(
+ document.metadata,
+ {"parents": DocumentationCategory.WEBSITE},
+ )
+
+ def test_sync_from_sitemap_force(self):
+ Document.objects.create(
+ release=self.release,
+ metadata={"parents": DocumentationCategory.WEBSITE},
+ path="some_path",
+ )
+ blog_entry = Entry.objects.create(
+ pub_date=timezone.now() - datetime.timedelta(days=2),
+ slug="a",
+ body="<strong>test</strong>",
+ content_format=ContentFormat.HTML,
+ is_active=True,
+ headline="Title 1",
+ )
+ blog_url = blog_entry.get_absolute_url()
+ Document.objects.create(
+ release=self.release,
+ metadata={"parents": DocumentationCategory.WEBSITE},
+ path=blog_url,
+ )
+ self.release.sync_from_sitemap(force=True)
+
+ document = Document.objects.get(release=self.release, path=blog_url)
+ # Confirm Document has been updated.
+ self.assertEqual(document.path, blog_url)
+ self.assertEqual(document.title, "Title 1")
+ self.assertIn("<strong>test</strong>", document.metadata["body"])
+ self.assertEqual(document.metadata["title"], "Title 1")
+ self.assertEqual(document.metadata["toc"], "")
+
class DocumentUrlTests(TestCase):
@classmethod
@@ -623,11 +729,3 @@ class DocumentUrlTests(TestCase):
],
transform=lambda doc: doc.get_absolute_url(),
)
-
- def test_document_url_documentation_category_website(self):
- self.release._sync_views_to_db()
- document_view = self.release.documents.get()
- self.assertEqual(
- document_view.get_absolute_url(),
- "http://www.djangoproject.localhost:8000/community/ecosystem/",
- )
diff --git a/docs/tests/test_utils.py b/docs/tests/test_utils.py
index 4ab65002..f676540b 100644
--- a/docs/tests/test_utils.py
+++ b/docs/tests/test_utils.py
@@ -3,7 +3,7 @@ from pathlib import Path
from django.test import SimpleTestCase
-from ..utils import get_doc_path, sanitize_for_trigram
+from ..utils import extract_inner_html, get_doc_path, sanitize_for_trigram
class TestUtils(SimpleTestCase):
@@ -38,3 +38,39 @@ class TestUtils(SimpleTestCase):
]:
with self.subTest(query=query):
self.assertEqual(sanitize_for_trigram(query), sanitized_query)
+
+ def test_extract_inner_html(self):
+ for html, expected_output in [
+ ("<main><p>Hello</p></main>", "<p>Hello</p>"),
+ (
+ '<header>Test</header><main id="app" class="container">'
+ "<h1>Title</h1></main>",
+ "<h1>Title</h1>",
+ ),
+ ("<main>&amp; &lt; &gt; &#169;</main>", "& < > ©"),
+ ("<main></main>", ""),
+ ("<main>Hello world</main>", "Hello world"),
+ ("<main><h1>Hi</h1>Text<p>Bye</p></main>", "<h1>Hi</h1>Text<p>Bye</p>"),
+ ]:
+ with self.subTest(html=html):
+ self.assertEqual(extract_inner_html(html, tag="main"), expected_output)
+
+ def test_extract_inner_html_multiple_same_tags_raises(self):
+ with self.assertRaisesMessage(
+ ValueError, "<main> occurs more than once in HTML."
+ ):
+ extract_inner_html(
+ "<main>One main</main><main id='dupe'>Two main</main>", tag="main"
+ )
+
+ def test_extract_inner_html_multiple_same_tags_nested_raises(self):
+ with self.assertRaisesMessage(
+ ValueError, "Nested <main> tags are not allowed."
+ ):
+ extract_inner_html(
+ "<main>One main<main id='dupe'>Two main</main></main>", tag="main"
+ )
+
+ def test_extract_inner_html_tag_not_found_raises(self):
+ with self.assertRaisesMessage(ValueError, "<main> not found in HTML."):
+ extract_inner_html("<p>Test</p>", tag="main")
diff --git a/docs/utils.py b/docs/utils.py
index bbbf5790..973f684b 100644
--- a/docs/utils.py
+++ b/docs/utils.py
@@ -1,5 +1,6 @@
import re
import unicodedata
+from html.parser import HTMLParser
from pathlib import Path
from django.conf import settings
@@ -93,3 +94,59 @@ def get_module_path(name, full_path):
if full_path.endswith(name_suffix):
return full_path.removesuffix(name_suffix)
return None
+
+
+class SingleTagInnerHTMLExtractor(HTMLParser):
+ def __init__(self, target_tag):
+ super().__init__()
+ self.target_tag = target_tag.lower()
+ self.capturing = False
+ self.inner_html = []
+ self.tag_count = 0
+
+ def handle_starttag(self, tag, attrs):
+ tag = tag.lower()
+ if tag == self.target_tag:
+ self.tag_count += 1
+ if self.capturing:
+ # Nested target tag not allowed.
+ raise ValueError(f"Nested <{self.target_tag}> tags are not allowed.")
+ self.capturing = True
+ elif self.capturing:
+ self.inner_html.append(self.get_starttag_text())
+
+ def handle_endtag(self, tag):
+ tag = tag.lower()
+ if self.capturing:
+ if tag == self.target_tag:
+ self.capturing = False
+ else:
+ self.inner_html.append(f"</{tag}>")
+
+ def handle_data(self, data):
+ if self.capturing:
+ self.inner_html.append(data)
+
+ def handle_entityref(self, name):
+ if self.capturing:
+ self.inner_html.append(f"&{name};")
+
+ def handle_charref(self, name):
+ if self.capturing:
+ self.inner_html.append(f"&#{name};")
+
+
+def extract_inner_html(html, tag):
+ """
+ Extracts the inner HTML of a tag that appears exactly once.
+ """
+ parser = SingleTagInnerHTMLExtractor(tag)
+ parser.feed(html)
+ parser.close()
+
+ if parser.tag_count == 0:
+ raise ValueError(f"<{tag}> not found in HTML.")
+ if parser.tag_count > 1:
+ raise ValueError(f"<{tag}> occurs more than once in HTML.")
+
+ return "".join(parser.inner_html)