diff --git a/pyproject.toml b/pyproject.toml index 8e27708..2bbad08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,7 @@ dependencies = [ "requests==2.32.3", "pydantic==2.8.2", "zimscraperlib==3.4.0", + "Jinja2==3.1.3", ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] @@ -23,7 +24,7 @@ lint = [ "ruff==0.5.1", ] check = [ - "pyright==1.1.370", + "pyright==1.1.374", ] test = [ "pytest==8.2.2", diff --git a/src/devdocs2zim/assets/COPYRIGHT b/src/devdocs2zim/assets/COPYRIGHT new file mode 100644 index 0000000..9c520b8 --- /dev/null +++ b/src/devdocs2zim/assets/COPYRIGHT @@ -0,0 +1,13 @@ +Copyright 2013-2024 Thibaut Courouble and other contributors + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. This appears to be unused. slug: str + def sort_precedence(self) -> SortPrecedence: + """Determines where this section should be displayed in the navigation.""" + if BEFORE_CONTENT_PATTERN.match(self.name): + return SortPrecedence.BEFORE_CONTENT + + if AFTER_CONTENT_PATTERN.match(self.name): + return SortPrecedence.AFTER_CONTENT + + return SortPrecedence.CONTENT + + +class NavigationSection: + """Represents a single section of a devdocs navigation tree.""" + + def __init__(self, section: DevdocsIndexType, links: list[DevdocsIndexEntry]): + """Initializes NavigationSection. + + Parameters: + section: Heading information for the group of links. + links: Links to display in the section. 
+ """ + self.name = section.name + self.count = section.count + self.links = links + + self._contained_pages = {link.path_without_fragment for link in links} + + def contains_page(self, page_path: str) -> bool: + """Returns whether this section contains the given page.""" + return page_path in self._contained_pages + class DevdocsIndex(BaseModel): """Represents entries in the //index.json file for each resource.""" @@ -102,10 +156,36 @@ class DevdocsIndex(BaseModel): entries: list[DevdocsIndexEntry] # List of "types" or section headings. - # These are displayed mostly in order, except regular expressions are used to sort: - # https://github.com/freeCodeCamp/devdocs/blob/e28f81d3218bdbad7eac0540c58c11c7fe1d33d3/assets/javascripts/collections/types.js#L3 + # These are displayed in the order they're found grouped by sort_precedence. types: list[DevdocsIndexType] + def build_navigation(self) -> list[NavigationSection]: + """Builds a navigation hierarchy that's sorted correctly for rendering.""" + + sections_by_precedence: dict[SortPrecedence, list[DevdocsIndexType]] = ( + defaultdict(list) + ) + for section in self.types: + sections_by_precedence[section.sort_precedence()].append(section) + + links_by_section_name: dict[str, list[DevdocsIndexEntry]] = defaultdict(list) + for entry in self.entries: + if entry.type is None: + continue + links_by_section_name[entry.type].append(entry) + + output: list[NavigationSection] = [] + for precedence in SortPrecedence: + for section in sections_by_precedence[precedence]: + output.append( + NavigationSection( + section=section, + links=links_by_section_name[section.name], + ) + ) + + return output + class DevdocsClient: """Utility functions to read data from devdocs.""" diff --git a/src/devdocs2zim/generator.py b/src/devdocs2zim/generator.py index c903a7d..e19b44c 100644 --- a/src/devdocs2zim/generator.py +++ b/src/devdocs2zim/generator.py @@ -1,20 +1,37 @@ +# ruff: noqa: S607 + import argparse +import datetime import os import re 
+import time from collections import defaultdict +from pathlib import Path +from jinja2 import Environment, FileSystemLoader, select_autoescape from pydantic import BaseModel from zimscraperlib.constants import ( # pyright: ignore[reportMissingTypeStubs] MAXIMUM_DESCRIPTION_METADATA_LENGTH, MAXIMUM_LONG_DESCRIPTION_METADATA_LENGTH, RECOMMENDED_MAX_TITLE_LENGTH, ) +from zimscraperlib.zim import ( # pyright: ignore[reportMissingTypeStubs] + Creator, + StaticItem, +) from devdocs2zim.client import ( DevdocsClient, + DevdocsIndex, DevdocsMetadata, ) -from devdocs2zim.constants import logger +from devdocs2zim.constants import LANGUAGE_ISO_639_3, NAME, ROOT_DIR, VERSION, logger + +# Content to display for pages missing from DevDocs. +MISSING_PAGE = ( + "

This documentation is missing.

" + "

This is an error with DevDocs, not your ZIM reader e.g. Kiwix.

" +) class InvalidFormatError(Exception): @@ -274,6 +291,62 @@ def __init__( os.makedirs(self.output_folder, exist_ok=True) + # jinja2 environment setup + self.env = Environment( # type: ignore + loader=FileSystemLoader(ROOT_DIR.joinpath("templates")), + autoescape=select_autoescape(), + ) + + self.page_template = self.env.get_template("page.html") # type: ignore + self.license_template = self.env.get_template("license.html") # type: ignore + + self.logo_path = self.asset_path("devdocs_48.png") + self.copyright_path = self.asset_path("COPYRIGHT") + self.license_path = self.asset_path("LICENSE") + + @staticmethod + def asset_path(name: str) -> Path: + """Returns the path to name in the assets folder. + + Raises ValueError if the resource doesn't exist. + """ + path = ROOT_DIR.joinpath("assets", name) + if not path.exists(): + raise ValueError(f"File not found at {path}") + return path + + def load_common_files(self) -> list[StaticItem]: + """Loads common assets for the output.""" + static_files: list[StaticItem] = [] + + logger.info("Fetching common CSS...") + app_css = self.devdocs_client.read_application_css() + logger.debug(f" Found app CSS with {len(app_css)} chars.") + static_files.append( + StaticItem( + path="application.css", + content=app_css, + is_front=False, + mimetype="text/css", + ) + ) + + static_files.append( + StaticItem( + # Documentation doesn't end in .html so this file won't + # conflict. + path="licenses.html", + content=self.license_template.render( # type: ignore + copyright=self.copyright_path.read_text(), + license=self.license_path.read_text(), + ), + is_front=True, + mimetype="text/html", + ) + ) + + return static_files + def run(self) -> None: """Run the generator to fetch content and produce ZIMs.""" @@ -281,8 +354,152 @@ def run(self) -> None: all_docs = self.devdocs_client.list_docs() selected_doc_metadata = self.doc_filter.filter(all_docs) + # Check formatting early to bail if any templates are invalid. 
+ for doc_metadata in selected_doc_metadata: + self.zim_config.format(doc_metadata.placeholders()) + + common_resources = self.load_common_files() + # List all docs and copy one by one for doc_metadata in selected_doc_metadata: - logger.info(f"Fetching {doc_metadata.slug}") + self.generate_zim( + doc_metadata, + common_resources, + ) + + def generate_zim( + self, doc_metadata: DevdocsMetadata, common_resources: list[StaticItem] + ): + """Generates a zim for a single document.""" + logger.info(f"Generating ZIM for {doc_metadata.slug}") + + formatted_config = self.zim_config.format(doc_metadata.placeholders()) + zim_path = Path(self.output_folder, f"{formatted_config.name_format}.zim") + + if zim_path.exists(): + logger.warning(f" Skipping, {zim_path} already exists.") + return + + logger.info(f" Writing to: {zim_path}") + + creator = Creator(zim_path, "index") + creator.config_metadata( + Name=formatted_config.name_format, + Title=formatted_config.title_format, + Publisher=formatted_config.publisher, + Date=datetime.datetime.now(tz=datetime.UTC).date(), + Creator=formatted_config.creator, + Description=formatted_config.description_format, + LongDescription=formatted_config.long_description_format, + Language=LANGUAGE_ISO_639_3, + Tags=formatted_config.tags, + Scraper=f"{NAME} v{VERSION}", + Illustration_48x48_at_1=self.logo_path.read_bytes(), + ) + + # Disable indexing because it won't be available in the JS frontend + # and causes significant performance issues with rendered sidebars. + creator.config_indexing(False) + + # Start creator early to detect problems early. 
+ with creator as started_creator: + logger.info(" Fetching the index...") + index = self.devdocs_client.get_index(doc_metadata.slug) + logger.debug(f" The index has {len(index.entries)} entries.") + + logger.info(" Fetching the document database...") + db = self.devdocs_client.get_db(doc_metadata.slug) + logger.debug(f" The database has {len(db)} entries.") + + self.add_zim_contents( + creator=started_creator, + doc_metadata=doc_metadata, + index=index, + db=db, + common_resources=common_resources, + ) + + def add_zim_contents( + self, + creator: Creator, + doc_metadata: DevdocsMetadata, + index: DevdocsIndex, + db: dict[str, str], + common_resources: list[StaticItem], + ): + """Adds the doc contents to the ZIM. + + Parameters: + creator: ZIM writer. + doc_metadata: Document metadata for generating common pages. + index: Documentation index for the navigation bar. + db: Mapping between documentation path and HTML content. + common_resources: Static content to add to the documentation. + """ - raise NotImplementedError("ZIM creation is not yet implemented") + logger.info(" Adding common resources...") + for resource in common_resources: + creator.add_item(resource) # type: ignore + + # Set the title for each page to the navigation item that opens the page + # to the top i.e. without a fragment if it exists. Otherwise, the first + # navigation item that opens the page. + page_to_title: dict[str, str] = {} + for entry in index.entries: + path_without_fragment = entry.path_without_fragment + if path_without_fragment == entry.path: + page_to_title[path_without_fragment] = entry.name + elif path_without_fragment not in page_to_title: + page_to_title[path_without_fragment] = entry.name + + # Explicitly inject the index. 
+ page_to_title["index"] = f"{doc_metadata.name} Documentation" + + nav_sections = index.build_navigation() + + logger.info(f" Rendering {len(page_to_title)} pages...") + counter = 0 + render_delta = 0.0 + add_delta = 0.0 + for path, title in page_to_title.items(): + num_slashes = path.count("/") + rel_prefix = "../" * num_slashes + + start_render = time.time() + page_content = self.page_template.render( # type: ignore + rel_prefix=rel_prefix, + nav_sections=nav_sections, + devdocs_metadata=doc_metadata, + title=title, + path=path, + # Fill missing DevDocs content with indications that the issue + # isn't with this generator. + content=db.get(path, MISSING_PAGE), + ) + start_add = time.time() + creator.add_item_for( # type: ignore + path, + title=title, + content=page_content, # type: ignore + is_front=True, + # Compression is needed because images are embedded as Base64 and + # navigation is similar across pages. + should_compress=True, + mimetype="text/html", + ) + end = time.time() + + # Tracking metadata + render_delta += start_add - start_render + add_delta += end - start_add + counter += 1 + if counter % 100 == 0: + logger.debug( + f" Progress {counter} / {len(page_to_title)} pages " + f"({render_delta:0.2f}s rendering, {add_delta:0.2f}s adding)" + ) + + logger.info( + " Finished adding contents. " + f"({render_delta:0.2f}s rendering, {add_delta:0.2f}s adding)" + ) diff --git a/src/devdocs2zim/templates/license.html b/src/devdocs2zim/templates/license.html new file mode 100644 index 0000000..fd1f51c --- /dev/null +++ b/src/devdocs2zim/templates/license.html @@ -0,0 +1,17 @@ +{# Devdocs is an SPA so each page will have (nearly) identical content. #} + + + + Open-source License Information + + + 

This work contains resources from DevDocs.io licensed under + the following license.

+ +


{{ copyright }}
+ +


{{ license }}
+ + diff --git a/src/devdocs2zim/templates/page.html b/src/devdocs2zim/templates/page.html new file mode 100644 index 0000000..dac255b --- /dev/null +++ b/src/devdocs2zim/templates/page.html @@ -0,0 +1,47 @@ +{# Devdocs is an SPA so each page will have (nearly) identical content. #} + + + + {{title}} + + + + +
+ +
+ {{devdocs_metadata.name}} +
+ {% for section in nav_sections %} +
+ + + {{ section.count | safe}} + {{ section.name }} + + +
+ {% for link in section.links %} + + {{ link.name }} + + {% endfor %} +
+ {% endfor %} + Open-source Licenses +
{{ content | safe }}
+ + + diff --git a/tests/test_basic.py b/tests/test_basic.py new file mode 100644 index 0000000..39ac61e --- /dev/null +++ b/tests/test_basic.py @@ -0,0 +1,7 @@ +# pyright: strict, reportUnusedExpression=false + +from devdocs2zim.__about__ import __version__ + + +def test_version(): + assert "dev" in __version__