From 2df177e7e281177fbab5751e96c048285efb4021 Mon Sep 17 00:00:00 2001 From: Joseph Lewis III Date: Mon, 23 Sep 2024 11:07:29 -0700 Subject: [PATCH] Added FTS support. --- Dockerfile | 1 + pyproject.toml | 3 ++- src/devdocs2zim/generator.py | 26 ++++++++++++++++++-------- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index e0f8f56..e4723a8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,6 +5,7 @@ LABEL org.opencontainers.image.source https://github.com/openzim/devdocs RUN apt-get update \ && apt-get install -y --no-install-recommends \ libmagic1 \ + libcairo2 \ && rm -rf /var/lib/apt/lists/* \ && python -m pip install --no-cache-dir -U \ pip diff --git a/pyproject.toml b/pyproject.toml index 2bbad08..5e69d6e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,8 +10,9 @@ readme = "README.md" dependencies = [ "requests==2.32.3", "pydantic==2.8.2", - "zimscraperlib==3.4.0", + "zimscraperlib==4.0.0", "Jinja2==3.1.3", + "beautifulsoup4==4.12.3", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] diff --git a/src/devdocs2zim/generator.py b/src/devdocs2zim/generator.py index 9b8e76f..fd4dda1 100644 --- a/src/devdocs2zim/generator.py +++ b/src/devdocs2zim/generator.py @@ -5,6 +5,7 @@ from collections import defaultdict from pathlib import Path +from bs4 import BeautifulSoup from jinja2 import Environment, FileSystemLoader, select_autoescape from pydantic import BaseModel from zimscraperlib.constants import ( # pyright: ignore[reportMissingTypeStubs] @@ -16,7 +17,11 @@ Creator, StaticItem, ) +from zimscraperlib.zim.indexing import ( # pyright: ignore[reportMissingTypeStubs] + IndexData, +) +# pyright: ignore[reportMissingTypeStubs] from devdocs2zim.client import ( DevdocsClient, DevdocsIndex, @@ -339,6 +344,7 @@ def load_common_files(self) -> list[StaticItem]: content=app_css, is_front=False, mimetype="text/css", + auto_index=False, ) ) @@ -353,6 +359,7 @@ def load_common_files(self) -> list[StaticItem]: ), is_front=True, mimetype="text/plain", + auto_index=False, ) ) @@ -419,10 +426,6 @@ def generate_zim( Illustration_48x48_at_1=self.logo_path.read_bytes(), ) - # Disable indexing because it won't be available in the JS frontend - # and causes significant performance issues with rendered sidebars. - creator.config_indexing(False) - # Start creator early to detect problems early. with creator as started_creator: logger.info(" Fetching the index...") @@ -496,14 +499,16 @@ def add_zim_contents( num_slashes = path.count("/") rel_prefix = "../" * num_slashes - content = MISSING_PAGE - if path in db: - content = db.get(path) - else: + content = db.get(path, MISSING_PAGE) + if path not in db: logger.warning( f" DevDocs is missing content for {title!r} at {path!r}." ) + plain_content = " ".join( + BeautifulSoup(content, features="lxml").find_all(string=True) + ) + # NOTE: Profiling indicates Jinja templating takes about twice # the CPU time as adding items without compression. This appears to # be because of the navigation bar. @@ -527,6 +532,11 @@ def add_zim_contents( # navigation bar. should_compress=True, mimetype="text/html", + # Only index page content rather than navigation data. + index_data=IndexData( + title=title, + content=plain_content, + ), ) # Tracking metadta