From 004b788e60093488ea396d9615007b83faecd39a Mon Sep 17 00:00:00 2001 From: Sarah Boyce <42296566+sarahboyce@users.noreply.github.com> Date: Mon, 23 Mar 2026 14:32:13 +0100 Subject: Synced webpages from sitemap to Document model for search. (#2284) * Synced webpages from sitemap to Document model for search. * Removed is_searchable from blog Entry model. --- blog/admin.py | 3 +- blog/migrations/0007_remove_entry_is_searchable.py | 17 ++ blog/models.py | 9 - blog/tests.py | 20 --- djangoproject/scss/_style.scss | 2 +- djangoproject/sitemaps.py | 76 ++++++++ djangoproject/tests.py | 7 + djangoproject/urls/www.py | 3 + docs/management/commands/update_docs.py | 2 + docs/models.py | 112 ++++++------ docs/search.py | 40 ++--- docs/sitemaps.py | 23 +-- docs/tests/test_models.py | 198 +++++++++++++++------ docs/tests/test_utils.py | 38 +++- docs/utils.py | 57 ++++++ 15 files changed, 426 insertions(+), 181 deletions(-) create mode 100644 blog/migrations/0007_remove_entry_is_searchable.py create mode 100644 djangoproject/sitemaps.py diff --git a/blog/admin.py b/blog/admin.py index ceabd0ff..f4c3b764 100644 --- a/blog/admin.py +++ b/blog/admin.py @@ -16,10 +16,9 @@ class EntryAdmin(admin.ModelAdmin): "pub_date", "is_active", "is_published", - "is_searchable", "author", ) - list_filter = ("is_active", "is_searchable") + list_filter = ("is_active",) exclude = ("summary_html", "body_html") prepopulated_fields = {"slug": ("headline",)} raw_id_fields = ["social_media_card"] diff --git a/blog/migrations/0007_remove_entry_is_searchable.py b/blog/migrations/0007_remove_entry_is_searchable.py new file mode 100644 index 00000000..ac8fe879 --- /dev/null +++ b/blog/migrations/0007_remove_entry_is_searchable.py @@ -0,0 +1,17 @@ +# Generated by Django 5.2.7 on 2026-02-24 09:03 + +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ("blog", "0006_entry_is_searchable"), + ] + + operations = [ + migrations.RemoveField( + model_name="entry", + name="is_searchable", + ), + ] diff --git a/blog/models.py b/blog/models.py index 10e065a0..eb97b2ac 100644 --- a/blog/models.py +++ b/blog/models.py @@ -37,9 +37,6 @@ class EntryQuerySet(models.QuerySet): def active(self): return self.filter(is_active=True) - def searchable(self): - return self.filter(is_searchable=True) - class ContentFormat(models.TextChoices): REST = "reST", "reStructuredText" @@ -130,12 +127,6 @@ class Entry(models.Model): ), default=False, ) - is_searchable = models.BooleanField( - default=False, - help_text=_( - "Tick to make this entry appear in the Django documentation search." - ), - ) pub_date = models.DateTimeField( verbose_name=_("Publication date"), help_text=_( diff --git a/blog/tests.py b/blog/tests.py index d31ef6fa..5b6b0ea6 100644 --- a/blog/tests.py +++ b/blog/tests.py @@ -78,26 +78,6 @@ class EntryTestCase(DateTimeMixin, TestCase): transform=lambda entry: entry.headline, ) - def test_manager_searchable(self): - """ - Make sure that the Entry manager's `searchable` method works - """ - Entry.objects.create( - pub_date=self.yesterday, - is_searchable=False, - headline="not searchable", - slug="a", - ) - Entry.objects.create( - pub_date=self.yesterday, is_searchable=True, headline="searchable", slug="b" - ) - - self.assertQuerySetEqual( - Entry.objects.searchable(), - ["searchable"], - transform=lambda entry: entry.headline, - ) - def test_docutils_safe(self): """ Make sure docutils' file inclusion directives are disabled by default. diff --git a/djangoproject/scss/_style.scss b/djangoproject/scss/_style.scss index 4b5fcb66..581af11c 100644 --- a/djangoproject/scss/_style.scss +++ b/djangoproject/scss/_style.scss @@ -2718,7 +2718,7 @@ search.filters { position: relative; a { - padding: 10px 20px; + padding: 10px 15px; text-decoration: none; border-bottom: 3px solid transparent; transition: color 0.3s ease, border-bottom 0.3s ease; diff --git a/djangoproject/sitemaps.py b/djangoproject/sitemaps.py new file mode 100644 index 00000000..5fc3bd1c --- /dev/null +++ b/djangoproject/sitemaps.py @@ -0,0 +1,76 @@ +from dataclasses import dataclass + +from django.contrib import sitemaps +from django_hosts.resolvers import reverse + + +@dataclass +class URLObject: + name: str + host: str = "www" + + +class LocationAbsoluteUrlMixin: + def get_urls(self, site=None, **kwargs): + """ + Prevent the Django sitemap framework from prefixing the domain. + Use the absolute URL returned by location(). + """ + urls = [] + for item in self.items(): + loc = self.location(item) + urls.append( + { + "location": loc, + "lastmod": None, + "changefreq": self.changefreq, + "priority": self.priority, + } + ) + return urls + + +class TemplateViewSitemap(LocationAbsoluteUrlMixin, sitemaps.Sitemap): + priority = 0.5 + changefreq = "monthly" + + def items(self): + return [ + # accounts + URLObject("registration_register"), + # aggregator + URLObject("community-index"), + URLObject("community-ecosystem"), + URLObject("local-django-communities"), + # contact + URLObject("contact_foundation"), + # dashboard + URLObject("dashboard-index", host="dashboard"), + URLObject("metric-list", host="dashboard"), + # djangoproject + URLObject("homepage"), + URLObject("overview"), + URLObject("start"), + URLObject("code_of_conduct"), + URLObject("conduct_faq"), + URLObject("conduct_reporting"), + URLObject("conduct_enforcement"), + URLObject("conduct_changes"), + URLObject("diversity"), + URLObject("diversity_changes"), + # foundation + URLObject("foundation_meeting_archive_index"), + # fundraising + URLObject("fundraising:index"), + # members + URLObject("members:individual-members"), + URLObject("members:corporate-members"), + URLObject("members:corporate-members-join"), + URLObject("members:corporate-members-badges"), + URLObject("members:teams"), + # releases + URLObject("download"), + ] + + def location(self, item): + return reverse(item.name, host=item.host) diff --git a/djangoproject/tests.py b/djangoproject/tests.py index 05102e8c..2150fdec 100644 --- a/djangoproject/tests.py +++ b/djangoproject/tests.py @@ -202,6 +202,7 @@ class Header1Tests(ReleaseMixin, TestCase): "styleguide/", # Has multiple

examples. "admin/", # Admin templates are out of our control. "reset/done/", # Uses an admin template. + "sitemap.xml", ] resolver = get_resolver() urls = self.extract_patterns(resolver.url_patterns) @@ -222,3 +223,9 @@ class SecurityTxtTests(TestCase): self.assertEqual(response.status_code, HTTPStatus.OK) self.assertEqual(response["Content-Type"], "text/plain") self.assertIn("Expires:", response.content.decode()) + + +class SiteMapTests(TestCase): + def test_sitemap_renders(self): + response = self.client.get(reverse("sitemap")) + self.assertEqual(response.status_code, 200) diff --git a/djangoproject/urls/www.py b/djangoproject/urls/www.py index b6091983..de4f1818 100644 --- a/djangoproject/urls/www.py +++ b/djangoproject/urls/www.py @@ -13,6 +13,7 @@ from accounts import views as account_views from aggregator.feeds import CommunityAggregatorFeed, CommunityAggregatorFirehoseFeed from blog.feeds import WeblogEntryFeed from blog.sitemaps import WeblogSitemap +from djangoproject.sitemaps import TemplateViewSitemap from foundation.feeds import FoundationMinutesFeed from foundation.views import CoreDevelopers @@ -21,6 +22,7 @@ admin.autodiscover() sitemaps = { "weblog": WeblogSitemap, "flatpages": FlatPageSitemap, + "templates": TemplateViewSitemap, } @@ -136,6 +138,7 @@ urlpatterns = [ "sitemap.xml", cache_page(60 * 60 * 6)(sitemap_views.sitemap), {"sitemaps": sitemaps}, + name="sitemap", ), path( ".well-known/security.txt", diff --git a/docs/management/commands/update_docs.py b/docs/management/commands/update_docs.py index acc0069b..b5b01131 100644 --- a/docs/management/commands/update_docs.py +++ b/docs/management/commands/update_docs.py @@ -134,6 +134,8 @@ class Command(BaseCommand): if self.verbosity >= 1: self.stdout.write(f"Starting update for {release} at {datetime.now()}...") + release.sync_from_sitemap(force=force) + # checkout_dir is shared for all languages. checkout_dir = settings.DOCS_BUILD_ROOT / "sources" / release.version parent_build_dir = settings.DOCS_BUILD_ROOT / release.lang / release.version diff --git a/docs/models.py b/docs/models.py index b5ad2870..8dd7268c 100644 --- a/docs/models.py +++ b/docs/models.py @@ -1,6 +1,7 @@ import datetime import html import json +import logging import operator from functools import partial, reduce from pathlib import Path @@ -26,19 +27,21 @@ from django.utils.functional import cached_property from django.utils.html import strip_tags from django_hosts.resolvers import reverse -from blog.models import Entry from releases.models import Release from . import utils from .search import ( DEFAULT_TEXT_SEARCH_CONFIG, - SEARCHABLE_VIEWS, START_SEL, STOP_SEL, TSEARCH_CONFIG_LANGUAGES, DocumentationCategory, + fetch_html, get_document_search_vector, ) +from .utils import extract_inner_html + +logger = logging.getLogger(__name__) def get_search_config(lang): @@ -185,7 +188,7 @@ class DocumentRelease(models.Model): the database. Deletes all the release's documents first then reinserts them as needed. """ - self.documents.all().delete() + self.documents.exclude(metadata__parents=DocumentationCategory.WEBSITE).delete() # Read excluded paths from robots.docs.txt. robots_path = settings.BASE_DIR / "djangoproject" / "static" / "robots.docs.txt" @@ -216,65 +219,52 @@ class DocumentRelease(models.Model): metadata=document, config=get_search_config(self.lang), ) - for document in self.documents.all(): + for document in self.documents.exclude( + metadata__parents=DocumentationCategory.WEBSITE + ): document.metadata["breadcrumbs"] = list( Document.objects.breadcrumbs(document).values("title", "path") ) document.save(update_fields=("metadata",)) - self._sync_blog_to_db() - self._sync_views_to_db() + def sync_from_sitemap(self, force=False): + from djangoproject.urls.www import sitemaps - def _sync_blog_to_db(self): - """ - Sync the blog entries into search based on the release documents - support end date. - """ - if self.lang != "en": - return # The blog is only written in English currently + if not self.is_dev: + return - entries = Entry.objects.published().searchable() - Document.objects.bulk_create( - [ - Document( - release=self, - path=entry.get_absolute_url(), - title=entry.headline, - metadata={ - "body": entry.body_html, - "breadcrumbs": [ - { - "path": DocumentationCategory.WEBSITE, - "title": "News", - }, - ], - "parents": DocumentationCategory.WEBSITE, - "slug": entry.slug, - "title": entry.headline, - "toc": "", - }, - config=get_search_config(self.lang), - ) - for entry in entries - ] - ) + if force: + Document.objects.filter( + metadata__parents=DocumentationCategory.WEBSITE + ).delete() - def _sync_views_to_db(self): - """ - Sync the specific views into search based on the release documents - support end date. - """ - if self.lang != "en": - return # The searchable views are only written in English currently + doc_urls = set( + Document.objects.filter( + metadata__parents=DocumentationCategory.WEBSITE + ).values_list("path", flat=True) + ) - Document.objects.bulk_create( - [ - Document( + for sitemap in sitemaps.values(): + for url in sitemap().get_urls(): + path = url["location"] + if path in doc_urls: + continue + try: + page_html = fetch_html(path) + except ValueError: + logger.exception("Error indexing template view for search") + continue + try: + main_html = extract_inner_html(page_html, tag="main") + title = extract_inner_html(page_html, tag="h1") + except ValueError: + continue + Document.objects.create( release=self, - path=searchable_view.www_absolute_url, - title=searchable_view.page_title, + path=path, + title=title, metadata={ - "body": searchable_view.html, + "body": main_html, "breadcrumbs": [ { "path": DocumentationCategory.WEBSITE, @@ -282,15 +272,11 @@ class DocumentRelease(models.Model): }, ], "parents": DocumentationCategory.WEBSITE, - "slug": searchable_view.url_name, - "title": searchable_view.page_title, + "title": title, "toc": "", }, config=get_search_config(self.lang), ) - for searchable_view in SEARCHABLE_VIEWS - ] - ) def _clean_document_path(path): @@ -351,6 +337,20 @@ class DocumentQuerySet(models.QuerySet): config=models.F("config"), ) base_filter = Q(release_id=release.id) + if release.lang == settings.DEFAULT_LANGUAGE_CODE and not release.is_dev: + # Fetch the "dev" release explicitly so we can filter by release_id. + # This avoids JOINs in the main query and leverages the indexed FK, + # which is more efficient than filtering by release__release__version + # and release__lang. + dev_release = DocumentRelease.objects.get_by_version_and_lang( + version="dev", lang=settings.DEFAULT_LANGUAGE_CODE + ) + # Website content (non-docs content) is associated with the "dev" + # release. This is included in the search results. + base_filter |= Q( + release_id=dev_release.id, + metadata__parents=DocumentationCategory.WEBSITE, + ) if document_category: base_filter &= Q(metadata__parents__startswith=document_category) base_qs = ( diff --git a/docs/search.py b/docs/search.py index 0b7eaef1..4a779a75 100644 --- a/docs/search.py +++ b/docs/search.py @@ -1,11 +1,10 @@ -from dataclasses import dataclass +from urllib.parse import urlparse from django.contrib.postgres.search import SearchVector from django.db.models import TextChoices from django.db.models.fields.json import KeyTextTransform -from django.template.loader import get_template +from django.test import Client from django.utils.translation import gettext_lazy as _ -from django_hosts import reverse # Imported from # https://github.com/postgres/postgres/blob/REL_14_STABLE/src/bin/initdb/initdb.c#L659 @@ -81,25 +80,16 @@ class DocumentationCategory(TextChoices): return None -@dataclass -class SearchableView: - page_title: str - url_name: str - template: str - - @property - def html(self): - return get_template(self.template).render() - - @property - def www_absolute_url(self): - return reverse(self.url_name, host="www") - - -SEARCHABLE_VIEWS = [ - SearchableView( - page_title="Django's Ecosystem", - url_name="community-ecosystem", - template="aggregator/ecosystem.html", - ), -] +def fetch_html(url): + parsed = urlparse(url) + headers = {"HOST": parsed.netloc} # Use netloc to include the port. + client = Client(headers=headers, raise_request_exception=False) + response = client.get(parsed.path) + content_type = response.headers.get("Content-Type", "") + if response.status_code == 200 and "text/html" in content_type: + return response.text + raise ValueError( + f"Failed to fetch {url}, " + f"status code: {response.status_code}, " + f"Content-Type: {content_type}" + ) diff --git a/docs/sitemaps.py b/docs/sitemaps.py index 6f520aa7..c46ec093 100644 --- a/docs/sitemaps.py +++ b/docs/sitemaps.py @@ -1,10 +1,12 @@ from django.contrib.sitemaps import Sitemap +from djangoproject.sitemaps import LocationAbsoluteUrlMixin + from .models import Document from .search import DocumentationCategory -class DocsSitemap(Sitemap): +class DocsSitemap(LocationAbsoluteUrlMixin, Sitemap): def __init__(self, lang): self.lang = lang @@ -16,6 +18,9 @@ class DocsSitemap(Sitemap): .select_related("release__release") ) + def location(self, item): + return item.get_absolute_url() + def changefreq(self, obj): return "daily" @@ -33,19 +38,3 @@ class DocsSitemap(Sitemap): return 1 else: return 0.1 - - def _urls(self, page, site, protocol): - # XXX: To workaround bad interaction between contrib.sitemaps and - # django-hosts (scheme/domain would be repeated twice in URLs) - urls = [] - for item in self.paginator.page(page).object_list: - loc = item.get_absolute_url() - priority = self.priority(item) - url_info = { - "item": item, - "location": loc, - "changefreq": self.changefreq(item), - "priority": str(priority if priority is not None else ""), - } - urls.append(url_info) - return urls diff --git a/docs/tests/test_models.py b/docs/tests/test_models.py index a5e22065..1ad94281 100644 --- a/docs/tests/test_models.py +++ b/docs/tests/test_models.py @@ -5,9 +5,8 @@ from django.conf import settings from django.db import connection from django.test import TestCase from django.utils import timezone -from django_hosts import reverse -from blog.models import Entry +from blog.models import ContentFormat, Entry from releases.models import Release from ..models import Document, DocumentRelease @@ -184,6 +183,9 @@ class ManagerTests(TestCase): class DocumentManagerTest(TestCase): @classmethod def setUpTestData(cls): + cls.dev_release = DocumentRelease.objects.create( + lang=settings.DEFAULT_LANGUAGE_CODE + ) cls.release = DocumentRelease.objects.create( release=Release.objects.create(version="1.2.3"), ) @@ -358,6 +360,20 @@ class DocumentManagerTest(TestCase): "release": cls.release_fr, "title": "Notes de publication de Django 1.9.4", }, + { + "metadata": { + "body": "Main 1", + "breadcrumbs": [ + {"path": DocumentationCategory.WEBSITE, "title": "Website"} + ], + "parents": DocumentationCategory.WEBSITE, + "title": "Title 1", + "toc": "", + }, + "path": "example", + "release": cls.dev_release, + "title": "Blog post", + }, ] Document.objects.bulk_create(Document(**doc) for doc in documents) @@ -457,28 +473,21 @@ class DocumentManagerTest(TestCase): ), ) + def test_website_document_items_included_english(self): + self.assertQuerySetEqual( + Document.objects.search("Main", self.release), + ["Blog post"], + transform=attrgetter("title"), + ) + + def test_website_document_items_excluded_non_english(self): + self.assertEqual(Document.objects.search("Main", self.release_fr).count(), 0) + class UpdateDocTests(TestCase): @classmethod def setUpTestData(cls): - now = timezone.now() - cls.release = DocumentRelease.objects.create( - release=Release.objects.create( - version="1.0.0", - eol_date=now + datetime.timedelta(days=1), - ) - ) - cls.entry = Entry.objects.create( - pub_date=now, - is_active=True, - is_searchable=True, - headline="Searchable post", - slug="a", - body_html="

Searchable Blog Post

", - ) - cls.docs_documents = cls.release.documents.exclude( - metadata__parents=DocumentationCategory.WEBSITE - ) + cls.release = DocumentRelease.objects.create(is_default=True) def test_sync_to_db(self): self.release.sync_to_db( @@ -490,24 +499,8 @@ class UpdateDocTests(TestCase): } ] ) - self.assertQuerySetEqual( - self.release.documents.all(), - [ - "foo/bar", - reverse("community-ecosystem", host="www"), - self.entry.get_absolute_url(), - ], - ordered=False, - transform=attrgetter("path"), - ) - - def test_sync_to_db_skip_non_english(self): - """ - Releases must be English to include the blog and website results in search. - """ - non_english = DocumentRelease.objects.create(lang="es") - non_english.sync_to_db([]) - self.assertQuerySetEqual(non_english.documents.all(), []) + document = self.release.documents.get() + self.assertEqual(document.path, "foo/bar") def test_clean_path(self): self.release.sync_to_db( @@ -519,7 +512,7 @@ class UpdateDocTests(TestCase): } ] ) - document = self.docs_documents.get() + document = self.release.documents.get() self.assertEqual(document.path, "foo/bar") def test_title_strip_tags(self): @@ -533,7 +526,7 @@ class UpdateDocTests(TestCase): ] ) self.assertQuerySetEqual( - self.docs_documents.all(), + self.release.documents.all(), ["This is the title"], transform=attrgetter("title"), ) @@ -549,7 +542,7 @@ class UpdateDocTests(TestCase): ] ) self.assertQuerySetEqual( - self.docs_documents, + self.release.documents.all(), ["Title & title"], transform=attrgetter("title"), ) @@ -562,7 +555,7 @@ class UpdateDocTests(TestCase): {"current_page_name": "foo/3"}, ] ) - self.assertQuerySetEqual(self.docs_documents, []) + self.assertQuerySetEqual(self.release.documents.all(), []) def test_excluded_documents(self): """ @@ -590,6 +583,119 @@ class UpdateDocTests(TestCase): document = release.documents.get() self.assertEqual(document.path, "nonexcluded/bar") + def test_sync_to_db_not_delete_website_docs(self): + Document.objects.create( + release=self.release, + path="example_path", + title="Title 1", + metadata={ + "body": "Main 1", + "breadcrumbs": [ + {"path": DocumentationCategory.WEBSITE, "title": "Website"} + ], + "parents": DocumentationCategory.WEBSITE, + "title": "Title 1", + "toc": "", + }, + ) + self.release.sync_to_db([]) + self.assertEqual(Document.objects.filter(release=self.release).count(), 1) + + def test_sync_from_sitemap_skip_non_en_dev_release(self): + release = Release.objects.create(version="5.2") + blog_entry = Entry.objects.create( + pub_date=timezone.now() - datetime.timedelta(days=2), + slug="a", + body="test", + content_format=ContentFormat.HTML, + is_active=True, + ) + for lang, release_obj in [ + ("fr", release), + (settings.DEFAULT_LANGUAGE_CODE, release), + ]: + doc_release = DocumentRelease.objects.create( + lang=lang, + release=release_obj, + ) + with self.subTest(lang=lang, release=release_obj): + doc_release.sync_from_sitemap() + self.assertFalse( + Document.objects.filter(path=blog_entry.get_absolute_url()).exists() + ) + + def test_sync_from_sitemap(self): + blog_entry = Entry.objects.create( + pub_date=timezone.now() - datetime.timedelta(days=2), + slug="a", + body="test", + headline="Title 1", + content_format=ContentFormat.HTML, + is_active=True, + ) + self.release.sync_from_sitemap() + + document = Document.objects.get( + release=self.release, path=blog_entry.get_absolute_url() + ) + self.assertEqual(document.title, "Title 1") + self.assertIn("test", document.metadata["body"]) + self.assertEqual(document.metadata["title"], "Title 1") + self.assertEqual(document.metadata["toc"], "") + + def test_sync_from_sitemap_only_requests_non_existing(self): + blog_entry = Entry.objects.create( + pub_date=timezone.now() - datetime.timedelta(days=2), + slug="a", + body="test", + content_format=ContentFormat.HTML, + is_active=True, + ) + Document.objects.create( + release=self.release, + metadata={"parents": DocumentationCategory.WEBSITE}, + path=blog_entry.get_absolute_url(), + ) + self.release.sync_from_sitemap() + document = Document.objects.get( + release=self.release, path=blog_entry.get_absolute_url() + ) + # Confirm Document has not been updated. + self.assertEqual( + document.metadata, + {"parents": DocumentationCategory.WEBSITE}, + ) + + def test_sync_from_sitemap_force(self): + Document.objects.create( + release=self.release, + metadata={"parents": DocumentationCategory.WEBSITE}, + path="some_path", + ) + blog_entry = Entry.objects.create( + pub_date=timezone.now() - datetime.timedelta(days=2), + slug="a", + body="test", + content_format=ContentFormat.HTML, + is_active=True, + headline="Title 1", + ) + blog_url = blog_entry.get_absolute_url() + Document.objects.create( + release=self.release, + metadata={"parents": DocumentationCategory.WEBSITE}, + path=blog_url, + ) + self.release.sync_from_sitemap(force=True) + + document = Document.objects.get(release=self.release, path=blog_url) + # Confirm Document has been updated. + self.assertEqual(document.path, blog_url) + self.assertEqual(document.title, "Title 1") + self.assertIn("test", document.metadata["body"]) + self.assertEqual(document.metadata["title"], "Title 1") + self.assertEqual(document.metadata["toc"], "") + class DocumentUrlTests(TestCase): @classmethod @@ -623,11 +729,3 @@ class DocumentUrlTests(TestCase): ], transform=lambda doc: doc.get_absolute_url(), ) - - def test_document_url_documentation_category_website(self): - self.release._sync_views_to_db() - document_view = self.release.documents.get() - self.assertEqual( - document_view.get_absolute_url(), - "http://www.djangoproject.localhost:8000/community/ecosystem/", - ) diff --git a/docs/tests/test_utils.py b/docs/tests/test_utils.py index 4ab65002..f676540b 100644 --- a/docs/tests/test_utils.py +++ b/docs/tests/test_utils.py @@ -3,7 +3,7 @@ from pathlib import Path from django.test import SimpleTestCase -from ..utils import get_doc_path, sanitize_for_trigram +from ..utils import extract_inner_html, get_doc_path, sanitize_for_trigram class TestUtils(SimpleTestCase): @@ -38,3 +38,39 @@ class TestUtils(SimpleTestCase): ]: with self.subTest(query=query): self.assertEqual(sanitize_for_trigram(query), sanitized_query) + + def test_extract_inner_html(self): + for html, expected_output in [ + ("

Hello

", "

Hello

"), + ( + '
Test
' + "

Title

", + "

Title

", + ), + ("
& < > ©
", "& < > ©"), + ("
", ""), + ("
Hello world
", "Hello world"), + ("

Hi

Text

Bye

", "

Hi

Text

Bye

"), + ]: + with self.subTest(html=html): + self.assertEqual(extract_inner_html(html, tag="main"), expected_output) + + def test_extract_inner_html_multiple_same_tags_raises(self): + with self.assertRaisesMessage( + ValueError, "
occurs more than once in HTML." + ): + extract_inner_html( + "
One main
Two main
", tag="main" + ) + + def test_extract_inner_html_multiple_same_tags_nested_raises(self): + with self.assertRaisesMessage( + ValueError, "Nested
tags are not allowed." + ): + extract_inner_html( + "
One main
Two main
", tag="main" + ) + + def test_extract_inner_html_tag_not_found_raises(self): + with self.assertRaisesMessage(ValueError, "
not found in HTML."): + extract_inner_html("

Test

", tag="main") diff --git a/docs/utils.py b/docs/utils.py index bbbf5790..973f684b 100644 --- a/docs/utils.py +++ b/docs/utils.py @@ -1,5 +1,6 @@ import re import unicodedata +from html.parser import HTMLParser from pathlib import Path from django.conf import settings @@ -93,3 +94,59 @@ def get_module_path(name, full_path): if full_path.endswith(name_suffix): return full_path.removesuffix(name_suffix) return None + + +class SingleTagInnerHTMLExtractor(HTMLParser): + def __init__(self, target_tag): + super().__init__() + self.target_tag = target_tag.lower() + self.capturing = False + self.inner_html = [] + self.tag_count = 0 + + def handle_starttag(self, tag, attrs): + tag = tag.lower() + if tag == self.target_tag: + self.tag_count += 1 + if self.capturing: + # Nested target tag not allowed. + raise ValueError(f"Nested <{self.target_tag}> tags are not allowed.") + self.capturing = True + elif self.capturing: + self.inner_html.append(self.get_starttag_text()) + + def handle_endtag(self, tag): + tag = tag.lower() + if self.capturing: + if tag == self.target_tag: + self.capturing = False + else: + self.inner_html.append(f"") + + def handle_data(self, data): + if self.capturing: + self.inner_html.append(data) + + def handle_entityref(self, name): + if self.capturing: + self.inner_html.append(f"&{name};") + + def handle_charref(self, name): + if self.capturing: + self.inner_html.append(f"&#{name};") + + +def extract_inner_html(html, tag): + """ + Extracts the inner HTML of a tag that appears exactly once. + """ + parser = SingleTagInnerHTMLExtractor(tag) + parser.feed(html) + parser.close() + + if parser.tag_count == 0: + raise ValueError(f"<{tag}> not found in HTML.") + if parser.tag_count > 1: + raise ValueError(f"<{tag}> occurs more than once in HTML.") + + return "".join(parser.inner_html) -- cgit v1.3