From fe2df90fea93a923885178ec250f0c9a5bf7ff99 Mon Sep 17 00:00:00 2001 From: Johannes Hoppe Date: Sat, 7 Feb 2026 15:23:10 +0100 Subject: [PATCH 01/16] feat: transform relative links to absolute paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix issue where #anchor links navigate to /#anchor. The build now transforms relative links to absolute paths: - #section → /blog/slug#section - ../other-slug → /blog/other-slug - ../other-slug#section → /blog/other-slug#section Uses path.posix.resolve() for cross-platform path resolution. linkBasePath is derived from folder structure (folder = URL path). Also fixes: mailto:, tel:, ftp:// links are now correctly preserved (isAbsoluteUrl uses /^\w+:/ regex to match all protocols). Includes comprehensive documentation of URL transformation system. --- shared/base.utils.ts | 12 +- shared/jekyll-markdown-parser.spec.ts | 363 +++++++++++++++++++++++--- shared/jekyll-markdown-parser.ts | 120 ++++++++- 3 files changed, 438 insertions(+), 57 deletions(-) diff --git a/shared/base.utils.ts b/shared/base.utils.ts index 3d35f1f..5ed1017 100644 --- a/shared/base.utils.ts +++ b/shared/base.utils.ts @@ -76,9 +76,11 @@ export async function markdownToEntry( markdown: string, folder: string, baseUrl: string, - blogPostsFolder: string + blogPostsFolder: string, + linkBasePath: string ): Promise { - const parser = new JekyllMarkdownParser(baseUrl + folder + '/'); + const imageBaseUrl = baseUrl + folder + '/'; + const parser = new JekyllMarkdownParser(imageBaseUrl, linkBasePath); const parsedJekyllMarkdown = parser.parse(markdown); const meta: Record = parsedJekyllMarkdown.parsedYaml; @@ -113,10 +115,14 @@ export async function getEntryList(entriesFolder: string, m const entryDirs = await readFolders(entriesFolder); const entries: T[] = []; + // Content type from folder structure: ../blog → blog, ../material → material + const contentType = path.basename(entriesFolder); + for (const entryDir of 
entryDirs) { const readmePath = path.join(entriesFolder, entryDir, README_FILE); const readme = await readMarkdownFile(readmePath); - const entry = await markdownToEntry(readme, entryDir, markdownBaseUrl, entriesFolder); + const linkBasePath = '/' + contentType + '/' + entryDir; + const entry = await markdownToEntry(readme, entryDir, markdownBaseUrl, entriesFolder, linkBasePath); entries.push(entry); } diff --git a/shared/jekyll-markdown-parser.spec.ts b/shared/jekyll-markdown-parser.spec.ts index c905319..37f868b 100644 --- a/shared/jekyll-markdown-parser.spec.ts +++ b/shared/jekyll-markdown-parser.spec.ts @@ -385,6 +385,7 @@ describe('Configured marked behavior (baseline)', () => { */ describe('JekyllMarkdownParser', () => { const baseUrl = 'https://example.com/blog/my-post/'; + const linkBasePath = '/blog/my-post'; describe('Comprehensive regression test (marked upgrade safety)', () => { /** @@ -392,7 +393,7 @@ describe('JekyllMarkdownParser', () => { * If this test fails, the upgrade broke something important! */ it('should produce expected output for comprehensive blog post', () => { - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const { parsedYaml, html } = parser.parse(COMPREHENSIVE_BLOG_POST); // === YAML Frontmatter === @@ -464,7 +465,7 @@ describe('JekyllMarkdownParser', () => { * 3. Update EXPECTED_HTML_WITH_IMAGE_TRANSFORM only if the change is intentional */ it('should produce EXACT HTML output (character-by-character)', () => { - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(COMPREHENSIVE_BLOG_POST); expect(result.html).toBe(EXPECTED_HTML_WITH_IMAGE_TRANSFORM); @@ -482,7 +483,7 @@ author: John Doe This is a test. 
`; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const { parsedYaml, html, markdown } = parser.parse(input); expect(parsedYaml.title).toBe('Test Post'); @@ -494,7 +495,7 @@ This is a test. it('should throw for markdown without frontmatter', () => { const input = '# Just Markdown\n\nNo frontmatter here.'; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); expect(() => parser.parse(input)).toThrow('YAML frontmatter is required'); }); @@ -508,7 +509,7 @@ title: Test ![Alt text](image.png) `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}image.png"`); @@ -522,7 +523,7 @@ title: Test ![Alt text](image.png "Image Title") `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('title="Image Title"'); @@ -537,7 +538,7 @@ title: Test ![External](https://other.com/image.png) `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="https://other.com/image.png"'); @@ -551,7 +552,7 @@ title: Test ![Alt](./image.png) `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}image.png"`); @@ -565,7 +566,7 @@ title: Test ![Data]() `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src=""'); @@ -578,7 +579,7 @@ title: Test ![Icon](assets/img/icon.svg) `; - 
const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="assets/img/icon.svg"'); @@ -594,7 +595,7 @@ title: Test Photo `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}photo.jpg"`); @@ -608,7 +609,7 @@ title: Test Photo `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}photo.jpg"`); @@ -622,7 +623,7 @@ title: Test Photo `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}images/photo.jpg"`); @@ -635,7 +636,7 @@ title: Test External `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="https://other.com/image.png"'); @@ -649,7 +650,7 @@ title: Test CDN `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="//cdn.example.com/image.png"'); @@ -663,7 +664,7 @@ title: Test Data `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src=""'); @@ -677,7 +678,7 @@ title: Test Icon `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="assets/img/icon.svg"'); 
@@ -691,7 +692,7 @@ title: Test Logo `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="/images/logo.png"'); @@ -705,7 +706,7 @@ title: Test Photo `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}photo.jpg"`); @@ -723,7 +724,7 @@ title: Test Second Third `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}first.jpg"`); @@ -738,7 +739,7 @@ title: Test Photo `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src='${baseUrl}photo.jpg'`); @@ -752,7 +753,7 @@ title: Test External `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain("src='https://example.com/external.png'"); @@ -773,7 +774,7 @@ title: Test This has highlighted text and HTML. `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('highlighted'); @@ -789,7 +790,7 @@ title: Test

Custom styled content

`; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('
'); @@ -803,7 +804,7 @@ title: Test A special image `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('class="rounded shadow"'); @@ -821,7 +822,7 @@ title: Test Example \`\`\` `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); // Code is escaped and syntax-highlighted by highlight.js @@ -842,7 +843,7 @@ title: Test Code \`\`\` `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}real-image.jpg"`); @@ -866,7 +867,7 @@ title: Test ![HTTP Image](http://insecure.com/image.png) `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="http://insecure.com/image.png"'); @@ -880,7 +881,7 @@ title: Test HTTP `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="http://insecure.com/image.png"'); @@ -896,7 +897,7 @@ title: Test ![He said "hello"](image.png) `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); // Quotes should be escaped to prevent broken HTML @@ -915,7 +916,7 @@ title: Test ![Alt](image.png "Title with "quotes"") `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); // Marked does NOT parse this as an image - it becomes literal text @@ -930,7 +931,7 @@ title: Test ![Array](image.png) `; - const parser 
= new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('alt="Array<string>"'); @@ -943,7 +944,7 @@ title: Test ![Tom & Jerry](image.png) `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('alt="Tom & Jerry"'); @@ -953,7 +954,7 @@ title: Test describe('YAML frontmatter edge cases', () => { it('should handle Windows line endings (CRLF)', () => { const input = '---\r\ntitle: Test\r\n---\r\n\r\n# Hello'; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const { parsedYaml, html } = parser.parse(input); expect(parsedYaml.title).toBe('Test'); @@ -963,7 +964,7 @@ title: Test it('should throw for only one separator (no valid frontmatter)', () => { const input = '---\nThis is not YAML, just a horizontal rule\n\n# Hello'; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); expect(() => parser.parse(input)).toThrow('YAML frontmatter is required'); }); @@ -979,7 +980,7 @@ title: Test This is after a horizontal rule. `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.parsedYaml.title).toBe('Test'); @@ -989,7 +990,7 @@ This is after a horizontal rule. it('should handle trailing whitespace after --- separator', () => { const input = '--- \ntitle: Test\n---\t\n\n# Hello'; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.parsedYaml.title).toBe('Test'); @@ -1005,13 +1006,14 @@ This is after a horizontal rule. // 2. 
transformRelativeImagePaths runs on the ENTIRE HTML output // 3. It must NOT add baseUrl again to URLs that already start with the placeholder const placeholderBaseUrl = `${MARKDOWN_BASE_URL_PLACEHOLDER}/blog/my-post/`; + const placeholderLinkPath = '/blog/my-post'; const input = `--- title: Test --- ![Screenshot](screenshot.png) `; - const parser = new JekyllMarkdownParser(placeholderBaseUrl); + const parser = new JekyllMarkdownParser(placeholderBaseUrl, placeholderLinkPath); const result = parser.parse(input); // Should have exactly ONE placeholder prefix, not two! @@ -1022,13 +1024,14 @@ title: Test it('should NOT double-prefix raw HTML images with placeholder in src', () => { // Edge case: What if someone manually writes the placeholder in HTML? const placeholderBaseUrl = `${MARKDOWN_BASE_URL_PLACEHOLDER}/blog/my-post/`; + const placeholderLinkPath = '/blog/my-post'; const input = `--- title: Test --- Already prefixed `; - const parser = new JekyllMarkdownParser(placeholderBaseUrl); + const parser = new JekyllMarkdownParser(placeholderBaseUrl, placeholderLinkPath); const result = parser.parse(input); // Should NOT add another prefix @@ -1037,6 +1040,284 @@ title: Test }); }); + describe('Relative link transformation', () => { + it('should transform #anchor to absolute path', () => { + const input = `--- +title: Test +--- + +Check the [introduction](#introduction) section. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/my-post#introduction"'); + }); + + it('should transform ../sibling-slug to absolute path', () => { + const input = `--- +title: Test +--- + +See [other article](../other-post) for more. 
+`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/other-post"'); + }); + + it('should transform ../sibling-slug#section to absolute path with anchor', () => { + const input = `--- +title: Test +--- + +See [Angular 10](../2020-06-angular10#setup) for details. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/2020-06-angular10#setup"'); + }); + + it('should NOT transform external https:// links', () => { + const input = `--- +title: Test +--- + +Check [Angular docs](https://angular.io/docs) for more. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="https://angular.io/docs"'); + }); + + it('should NOT transform external http:// links', () => { + const input = `--- +title: Test +--- + +Check [old site](http://example.com) for more. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="http://example.com"'); + }); + + it('should NOT transform already-absolute paths starting with /', () => { + const input = `--- +title: Test +--- + +Check [another post](/blog/2023-01-other-post) for more. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/2023-01-other-post"'); + }); + + it('should NOT transform already-absolute paths with hash', () => { + const input = `--- +title: Test +--- + +Check [section](/blog/2023-01-other-post#setup) for more. 
+`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/2023-01-other-post#setup"'); + }); + + it('should NOT transform absolute paths in raw HTML anchor tags', () => { + const input = `--- +title: Test +--- + +Other post +Section link +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/other-post"'); + expect(result.html).toContain('href="/blog/other-post#section"'); + }); + + it('should NOT transform https:// links in raw HTML anchor tags', () => { + const input = `--- +title: Test +--- + +Angular Docs +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="https://angular.io/guide/components"'); + }); + + it('should NOT transform mailto: links', () => { + const input = `--- +title: Test +--- + +Contact us at [team@example.com](mailto:team@example.com). +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="mailto:team@example.com"'); + }); + + it('should NOT transform tel: links', () => { + const input = `--- +title: Test +--- + +Call us at [+49 123 456](tel:+49123456). +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="tel:+49123456"'); + }); + + it('should NOT transform ftp:// links', () => { + const input = `--- +title: Test +--- + +Download from [FTP](ftp://files.example.com/file.zip). 
+`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="ftp://files.example.com/file.zip"'); + }); + + it('should NOT transform mailto: in raw HTML', () => { + const input = `--- +title: Test +--- + +Mail +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="mailto:team@angular-buch.com"'); + }); + + it('should transform ./relative links to current path', () => { + const input = `--- +title: Test +--- + +See [local file](./diagram.svg) for illustration. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/my-post/diagram.svg"'); + }); + + it('should transform multiple anchor links in TOC', () => { + const input = `--- +title: Test +--- + +## Inhalt + +- [Einleitung](#einleitung) +- [Hauptteil](#hauptteil) +- [Fazit](#fazit) +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/my-post#einleitung"'); + expect(result.html).toContain('href="/blog/my-post#hauptteil"'); + expect(result.html).toContain('href="/blog/my-post#fazit"'); + }); + + it('should handle raw HTML anchor tags with relative hrefs', () => { + const input = `--- +title: Test +--- + +Jump to section +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/my-post#section"'); + }); + + it('should preserve other attributes on anchor tags', () => { + const input = `--- +title: Test +--- + +Section +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/my-post#section"'); + expect(result.html).toContain('class="nav-link"'); + 
expect(result.html).toContain('id="toc-1"'); + }); + + it('should work with material paths', () => { + const materialLinkPath = '/material/signal-forms'; + const input = `--- +title: Test +--- + +See [other material](../other-material#section) for more. +`; + const parser = new JekyllMarkdownParser(baseUrl, materialLinkPath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/material/other-material#section"'); + }); + + it('should handle deeply nested relative paths', () => { + const input = `--- +title: Test +--- + +See [root](../../other) for more. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/other"'); + }); + + it('should NOT transform links inside code blocks', () => { + const input = `--- +title: Test +--- + +\`\`\`html +Link in code +\`\`\` +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // Code is escaped by highlight.js, so the link should not be transformed + // The important assertion: no transformed href in the output + expect(result.html).toContain('language-html'); + expect(result.html).not.toContain('href="/blog/my-post#section"'); + }); + }); + describe('baseUrl edge cases', () => { it('should work correctly when baseUrl has no trailing slash', () => { const baseUrlNoSlash = 'https://example.com/blog/my-post'; @@ -1046,7 +1327,7 @@ title: Test ![Alt](image.png) `; - const parser = new JekyllMarkdownParser(baseUrlNoSlash); + const parser = new JekyllMarkdownParser(baseUrlNoSlash, linkBasePath); const result = parser.parse(input); // Without trailing slash, path gets concatenated directly @@ -1062,7 +1343,7 @@ title: Test ![Alt](image.png) `; - const parser = new JekyllMarkdownParser(baseUrlWithSlash); + const parser = new JekyllMarkdownParser(baseUrlWithSlash, linkBasePath); const result = parser.parse(input); 
expect(result.html).toContain('src="https://example.com/blog/my-post/image.png"'); diff --git a/shared/jekyll-markdown-parser.ts b/shared/jekyll-markdown-parser.ts index c40dad1..9c86d78 100644 --- a/shared/jekyll-markdown-parser.ts +++ b/shared/jekyll-markdown-parser.ts @@ -1,9 +1,14 @@ +import { posix as path } from 'path'; import { load } from 'js-yaml'; import { Marked, Renderer, Tokens } from 'marked'; import { markedHighlight } from 'marked-highlight'; import { gfmHeadingId } from 'marked-gfm-heading-id'; import hljs from 'highlight.js'; +/** + * Placeholder for image base URL. Replaced at runtime by the Angular app. + * See "URL TRANSFORMATION SYSTEM" below for details. + */ export const MARKDOWN_BASE_URL_PLACEHOLDER = '%%MARKDOWN_BASE_URL%%'; /** @@ -14,8 +19,52 @@ export const MARKDOWN_BASE_URL_PLACEHOLDER = '%%MARKDOWN_BASE_URL%%'; * Original source: https://github.com/bouzuya/jekyll-markdown-parser * Repository archived on Jun 28, 2020 (read-only, no longer maintained) * - * SECURITY NOTE: - * -------------- + * ============================================================================ + * URL TRANSFORMATION SYSTEM + * ============================================================================ + * + * This parser handles two types of URL transformations: + * + * 1. IMAGES (baseUrl with MARKDOWN_BASE_URL_PLACEHOLDER) + * ------------------------------------------------------- + * Images use a placeholder that gets replaced at runtime by the Angular app. + * This allows serving images from different origins (CDN, local dev, etc.). + * + * Markdown: ![Alt](image.png) + * Build: + * Runtime: + * + * The placeholder is replaced in the Angular app based on environment config. + * This decouples the build from the deployment target. + * + * 2. LINKS (linkBasePath for relative → absolute transformation) + * --------------------------------------------------------------- + * Links are transformed from relative paths to absolute paths at build time. 
+ * This is necessary because Angular uses <base href="/"> which breaks + * relative anchor links (e.g., #section would navigate to /#section). + * + * Markdown: [Section](#section) + * Build: <a href="/blog/my-slug#section"> + * + * Markdown: [Other Post](../other-slug) + * Build: <a href="/blog/other-slug"> + * + * Markdown: [Other Section](../other-slug#intro) + * Build: <a href="/blog/other-slug#intro"> + * + * The linkBasePath is derived from the folder structure: + * blog/my-slug/README.md → linkBasePath = "/blog/my-slug" + * + * WHY TWO DIFFERENT APPROACHES? + * - Images: Need runtime flexibility (CDN on prod, proxy during development) + * - Links: The Angular website mimics the folder structure of this repo. + * blog/ content is served at /blog/, material/ at /material/. + * That's why build-time resolution works: folder path = URL path. + * + * ============================================================================ + * SECURITY NOTE + * ============================================================================ + * * This parser does NOT sanitize or escape HTML content. Raw HTML in markdown * is passed through intentionally. This is a FEATURE, not a bug. * @@ -24,8 +73,10 @@ export const MARKDOWN_BASE_URL_PLACEHOLDER = '%%MARKDOWN_BASE_URL%%'; * All markdown content comes from our own Git repository. There is no * user-generated content. XSS is not a concern in this context. * - * CHANGES FROM ORIGINAL: - * ----------------------- + * ============================================================================ + * CHANGES FROM ORIGINAL + * ============================================================================ + * * 1. BUG FIX: Regex in separate() had typo `/^---s*$/` instead of `/^---\s*$/`. * This bug exists in the original bouzuya source code (never fixed). * The literal `s*` matches zero or more 's' characters, not whitespace. * @@ -37,10 +88,13 @@ export const MARKDOWN_BASE_URL_PLACEHOLDER = '%%MARKDOWN_BASE_URL%%'; * 3. FEATURE: Added transformRelativeImagePaths() to handle raw HTML <img> * tags that bypass the markdown renderer. * - * 4. 
CHANGE: Converted from CommonJS module to ES6 class with constructor - * for baseUrl injection. + * 4. FEATURE: Added transformRelativeLinks() to convert relative links to + * absolute paths, fixing issues in Angular. * - * 5. UPGRADE: marked v4 → v17 migration + * 5. CHANGE: Converted from CommonJS module to ES6 class with constructor + * for baseUrl and linkBasePath injection. + * + * 6. UPGRADE: marked v4 → v17 migration * - Using Marked class instance instead of global marked * - marked-highlight extension for syntax highlighting * - marked-gfm-heading-id extension for heading IDs @@ -51,7 +105,14 @@ export class JekyllMarkdownParser { private marked: Marked; - constructor(private baseUrl: string) { + /** + * @param baseUrl - Base URL for images (e.g., '%%MARKDOWN_BASE_URL%%/blog/my-slug/') + * @param linkBasePath - Absolute path for links (e.g., '/blog/my-slug') + */ + constructor( + private baseUrl: string, + private linkBasePath: string + ) { this.marked = this.createMarkedInstance(); } @@ -70,12 +131,17 @@ export class JekyllMarkdownParser { /** * Check if a URL is absolute (should not be transformed). - * Absolute URLs include: https://, http://, data:, //, assets/, / + * Matches: protocols (mailto:, tel:, https:, etc.), protocol-relative (//), + * absolute paths (/), asset paths, and placeholder URLs. */ private isAbsoluteUrl(url: string): boolean { - return url.startsWith('https://') || url.startsWith('http://') || - url.startsWith('data:') || url.startsWith('//') || - url.startsWith('assets/') || url.startsWith('/') || + // Protocol pattern: word characters followed by colon (mailto:, tel:, https:, http:, ftp:, data:, etc.) + if (/^\w+:/.test(url)) { + return true; + } + return url.startsWith('//') || + url.startsWith('/') || + url.startsWith('assets/') || url.startsWith(MARKDOWN_BASE_URL_PLACEHOLDER); } @@ -131,6 +197,33 @@ export class JekyllMarkdownParser { }); } + /** + * Transform relative links to absolute paths. 
+ * Fixes issue where #anchor resolves to /#anchor. + * + * Uses path.posix.resolve() for proper relative path resolution: + * - #section → /blog/my-slug#section + * - ../other-slug → /blog/other-slug + * - ../other-slug#section → /blog/other-slug#section + */ + private transformRelativeLinks(html: string): string { + return html.replace(/]*)\shref=(["'])([^"']+)\2/g, (match, attrs, quote, href) => { + if (this.isAbsoluteUrl(href)) { + return match; + } + + const hasHash = href.includes('#'); + const [pathPart, hash] = hasHash ? href.split('#') : [href, '']; + + const resolved = pathPart + ? path.resolve(this.linkBasePath + '/', pathPart) + : this.linkBasePath; + + const newHref = hasHash ? resolved + '#' + hash : resolved; + return ` { From d0ccffc551f4df8c639595a54ce42d057bd74279 Mon Sep 17 00:00:00 2001 From: Johannes Hoppe Date: Sat, 7 Feb 2026 17:28:21 +0100 Subject: [PATCH 02/16] feat: add automatic TOC generation with [[toc]] marker - Add TOC_MARKER constant for [[toc]] placeholder - Add generateToc() using getHeadingList() from marked-gfm-heading-id - Add decodeHtmlEntities() for proper heading text display - Include h2 and h3 headings, skip headings before marker - Generate nested markdown list that gets processed normally Also includes: - Tests for mailto:, tel:, ftp:// links (not transformed) - Tests for TOC generation (6 new tests) --- shared/jekyll-markdown-parser.spec.ts | 146 +++++++++++++++++++++++++- shared/jekyll-markdown-parser.ts | 80 +++++++++++++- 2 files changed, 223 insertions(+), 3 deletions(-) diff --git a/shared/jekyll-markdown-parser.spec.ts b/shared/jekyll-markdown-parser.spec.ts index 37f868b..e77f9c9 100644 --- a/shared/jekyll-markdown-parser.spec.ts +++ b/shared/jekyll-markdown-parser.spec.ts @@ -3,7 +3,7 @@ import { Marked } from 'marked'; import { markedHighlight } from 'marked-highlight'; import { gfmHeadingId } from 'marked-gfm-heading-id'; import hljs from 'highlight.js'; -import { JekyllMarkdownParser, 
MARKDOWN_BASE_URL_PLACEHOLDER } from './jekyll-markdown-parser'; +import { JekyllMarkdownParser, MARKDOWN_BASE_URL_PLACEHOLDER, TOC_MARKER } from './jekyll-markdown-parser'; /** * Create a Marked instance with the same extensions as JekyllMarkdownParser. @@ -1349,5 +1349,149 @@ title: Test expect(result.html).toContain('src="https://example.com/blog/my-post/image.png"'); }); }); + + describe('Table of Contents (TOC) generation', () => { + it('should replace ${TOC_MARKER} marker with generated TOC', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## Einleitung + +Text. + +## Fazit + +End. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // TOC should contain links to headings after the marker + expect(result.html).toContain('href="/blog/my-post#einleitung"'); + expect(result.html).toContain('href="/blog/my-post#fazit"'); + // Should NOT contain the raw marker + expect(result.html).not.toContain('${TOC_MARKER}'); + }); + + it('should skip headings before ${TOC_MARKER} marker', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## Hauptteil + +Text. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // "Inhalt" heading should NOT be in the TOC links + expect(result.html).not.toContain('>Inhalt'); + // But "Hauptteil" should be in TOC + expect(result.html).toContain('href="/blog/my-post#hauptteil"'); + }); + + it('should include h2 and h3 headings with proper nesting', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## Kapitel 1 + +Text. + +### Unterkapitel 1.1 + +More text. + +## Kapitel 2 + +End. 
+`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/my-post#kapitel-1"'); + expect(result.html).toContain('href="/blog/my-post#unterkapitel-11"'); + expect(result.html).toContain('href="/blog/my-post#kapitel-2"'); + }); + + it('should handle special characters in headings', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## FAQ & Hilfe + +Text. + +## Über uns + +More. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/my-post#faq--hilfe"'); + // Note: marked URL-encodes non-ASCII chars in hrefs, but browser handles both + expect(result.html).toContain('href="/blog/my-post#%C3%BCber-uns"'); + // The link text should contain the original characters (HTML-escaped) + expect(result.html).toContain('>FAQ & Hilfe'); + expect(result.html).toContain('>Über uns'); + }); + + it('should work without ${TOC_MARKER} marker (no changes)', () => { + const input = `--- +title: Test +--- + +## Heading + +Text. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('

<h2 id="heading">Heading</h2>

'); + expect(result.html).not.toContain('${TOC_MARKER}'); + }); + + it('should generate empty TOC when no headings after marker', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +Just text, no more headings. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // Should not contain the marker + expect(result.html).not.toContain('${TOC_MARKER}'); + // TOC area should be essentially empty (just the Inhalt heading) + expect(result.html).toContain('

<h2 id="inhalt">Inhalt</h2>

'); + }); + }); }); }); diff --git a/shared/jekyll-markdown-parser.ts b/shared/jekyll-markdown-parser.ts index 9c86d78..2035e42 100644 --- a/shared/jekyll-markdown-parser.ts +++ b/shared/jekyll-markdown-parser.ts @@ -2,7 +2,7 @@ import { posix as path } from 'path'; import { load } from 'js-yaml'; import { Marked, Renderer, Tokens } from 'marked'; import { markedHighlight } from 'marked-highlight'; -import { gfmHeadingId } from 'marked-gfm-heading-id'; +import { gfmHeadingId, getHeadingList, resetHeadings } from 'marked-gfm-heading-id'; import hljs from 'highlight.js'; /** @@ -11,6 +11,12 @@ import hljs from 'highlight.js'; */ export const MARKDOWN_BASE_URL_PLACEHOLDER = '%%MARKDOWN_BASE_URL%%'; +/** + * Marker for automatic table of contents generation. + * Place [[toc]] in your markdown and it will be replaced with a generated TOC. + */ +export const TOC_MARKER = '[[toc]]'; + /** * ============================================================================ * MODIFIED PARSER - Based on bouzuya/jekyll-markdown-parser @@ -163,6 +169,67 @@ export class JekyllMarkdownParser { .replace(/>/g, '>'); } + /** + * Decode common HTML entities back to their original characters. + * Used for TOC generation where we need plain text from marked's escaped output. + */ + private decodeHtmlEntities(text: string): string { + return text + .replace(/&/g, '&') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, "'"); + } + + /** + * Generate a table of contents as Markdown from the document's headings. + * Uses getHeadingList() from marked-gfm-heading-id to get heading IDs. 
+ * + * @param markdown - The markdown content to extract headings from + * @returns Markdown list with links to headings, or empty string if no headings + */ + private generateToc(markdown: string): string { + // Parse markdown to collect headings (result is discarded, we only need side effect) + resetHeadings(); + this.marked.parse(markdown); + const headings = getHeadingList(); + + // Filter to h2 and h3, skip headings that appear before [[toc]] marker + const tocIndex = markdown.indexOf(TOC_MARKER); + const headingsAfterMarker = headings.filter(h => { + // Only include h2 and h3 + if (h.level < 2 || h.level > 3) return false; + // Skip the heading that contains the TOC (usually "Inhalt" or "Contents") + const headingPattern = new RegExp(`^#{${h.level}}\\s+${this.escapeRegex(this.decodeHtmlEntities(h.text))}`, 'm'); + const match = markdown.match(headingPattern); + if (match && match.index !== undefined && match.index < tocIndex) { + return false; + } + return true; + }); + + if (headingsAfterMarker.length === 0) { + return ''; + } + + // Generate markdown list + return headingsAfterMarker + .map(h => { + const indent = h.level === 3 ? ' ' : ''; + const text = this.decodeHtmlEntities(h.text); + return `${indent}* [${text}](#${h.id})`; + }) + .join('\n'); + } + + /** + * Escape special regex characters in a string. + */ + private escapeRegex(str: string): string { + return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); + } + /** * Custom image renderer that transforms relative URLs to absolute URLs. * marked v17 uses token-based API: renderer receives a token object. 
@@ -251,7 +318,16 @@ export class JekyllMarkdownParser { } private compileMarkdown(markdown: string): string { - const html = this.marked.parse(markdown) as string; + // Generate TOC if marker is present + let processedMarkdown = markdown; + if (markdown.includes(TOC_MARKER)) { + const toc = this.generateToc(markdown); + processedMarkdown = markdown.replace(TOC_MARKER, toc); + } + + // Reset headings and parse (generateToc already parsed once, but we need fresh state) + resetHeadings(); + const html = this.marked.parse(processedMarkdown) as string; const withImages = this.transformRelativeImagePaths(html); return this.transformRelativeLinks(withImages); } From 5606832e72af520fb8b884170885f7bb80865dc9 Mon Sep 17 00:00:00 2001 From: Johannes Hoppe Date: Sat, 7 Feb 2026 17:35:02 +0100 Subject: [PATCH 03/16] fix: add missing type exports and update test parameters - Add HeadingData, getHeadingList, and resetHeadings exports to types.d.ts - Add missing linkBasePath parameter to markdownToEntry test calls --- shared/base.utils.spec.ts | 12 ++++++++---- shared/types.d.ts | 16 +++++++++++++++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/shared/base.utils.spec.ts b/shared/base.utils.spec.ts index be45393..0e57335 100644 --- a/shared/base.utils.spec.ts +++ b/shared/base.utils.spec.ts @@ -250,7 +250,8 @@ describe('base.utils', () => { markdown, 'test-entry', 'https://example.com/', - '/non/existent/path' + '/non/existent/path', + '/blog/test-entry' )).rejects.toThrow(); }); @@ -261,7 +262,8 @@ describe('base.utils', () => { markdown, 'test-entry', 'https://example.com/', - '/tmp' + '/tmp', + '/blog/test-entry' ); // node-emoji converts :smile: to 😄 and :rocket: to 🚀 @@ -278,7 +280,8 @@ describe('base.utils', () => { markdown, 'test-entry', 'https://example.com/', - '/tmp' + '/tmp', + '/blog/test-entry' ); // js-yaml parses unquoted dates as Date objects, but we convert to ISO string @@ -293,7 +296,8 @@ describe('base.utils', () => { markdown, 
'my-awesome-post', 'https://example.com/', - '/tmp' + '/tmp', + '/blog/my-awesome-post' ); expect(result.slug).toBe('my-awesome-post'); diff --git a/shared/types.d.ts b/shared/types.d.ts index 057d8dd..8a1120b 100644 --- a/shared/types.d.ts +++ b/shared/types.d.ts @@ -1,4 +1,18 @@ declare module 'marked-gfm-heading-id' { import type { MarkedExtension } from 'marked'; - export function gfmHeadingId(): MarkedExtension; + + interface GfmHeadingIdOptions { + prefix?: string; + } + + export interface HeadingData { + level: number; + text: string; + raw: string; + id: string; + } + + export function gfmHeadingId(options?: GfmHeadingIdOptions): MarkedExtension; + export function getHeadingList(): HeadingData[]; + export function resetHeadings(): void; } From 54a13bcfaab9a5fda8b4fcc7153a9ffc794bd9b8 Mon Sep 17 00:00:00 2001 From: Johannes Hoppe Date: Sat, 7 Feb 2026 17:39:18 +0100 Subject: [PATCH 04/16] docs: comprehensive README with feature documentation - Document image URL transformation (placeholder system) - Document link transformation (relative to absolute) - Document TOC generation with [[toc]] marker - Add YAML frontmatter reference - Add architecture overview - Add submodule warning --- README.md | 298 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 263 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 328225a..41ebf9a 100644 --- a/README.md +++ b/README.md @@ -1,58 +1,286 @@ # website-articles-build -Shared build scripts for processing Markdown articles into JSON. +Build-System für Blog- und Material-Artikel. Transformiert Markdown zu JSON für die Angular-Websites. 
-Used as a git subtree in: +Wird als Git-Submodule verwendet in: - [angular-buch/website-articles](https://github.com/angular-buch/website-articles) - [angular-schule/website-articles](https://github.com/angular-schule/website-articles) -## Usage +## Setup ```bash npm install -npm run build +npm run build # Einmaliger Build +npm run watch # Watch-Mode für Entwicklung +npm test # Tests ausführen +npm run typecheck # TypeScript prüfen ``` -## Scripts - -| Script | Description | -|--------------|--------------------------------------| -| `build` | Build blog and material entries | -| `test` | Run tests | -| `test:watch` | Run tests in watch mode | -| `typecheck` | TypeScript type checking | -| `watch` | Watch mode for development | - -## Folder Structure +## Projekt-Struktur ``` -├── build.ts # Main entry point +website-articles-build/ +├── build.ts # Haupt-Build-Script ├── blog/ -│ ├── blog.types.ts # Blog-specific types -│ └── blog.utils.ts # Blog list utilities +│ ├── blog.types.ts # Blog-spezifische Typen +│ └── blog.utils.ts # Blog-spezifische Utilities ├── material/ -│ └── material.types.ts # Material-specific types +│ └── material.types.ts # Material-spezifische Typen └── shared/ - ├── base.types.ts # Shared base types - ├── base.utils.ts # File/folder utilities - ├── list.utils.ts # List extraction utilities - └── jekyll-markdown-parser.ts # Markdown parser + ├── jekyll-markdown-parser.ts # Markdown-Parser + ├── base.utils.ts # Gemeinsame Utilities + └── list.utils.ts # Listen-Utilities +``` + +## Output + +Der Build erzeugt für jeden Artikel: + +| Output | Beschreibung | +|--------|--------------| +| `dist/blog/{slug}/entry.json` | Vollständiger Artikel mit HTML | +| `dist/blog/list.json` | Liste aller Artikel (Light-Version) | +| `dist/material/{slug}/entry.json` | Vollständiger Material-Eintrag | +| `dist/material/list.json` | Liste aller Material-Einträge | + +--- + +## Features für Markdown-Autoren + +### 1. 
Bilder + +Relative Bildpfade werden automatisch transformiert: + +```markdown +![Screenshot](screenshot.png) +![Logo](./images/logo.png) +``` + +**Build-Output:** +```html + +``` + +Der Placeholder `%%MARKDOWN_BASE_URL%%` wird zur Laufzeit durch die Angular-App ersetzt (CDN auf Prod, Proxy in Dev). + +**Nicht transformiert werden:** +- Absolute URLs: `https://example.com/image.png` +- Protokoll-relative URLs: `//cdn.example.com/image.png` +- Asset-Pfade: `assets/img/icon.svg` +- Absolute Pfade: `/images/logo.png` +- Data-URIs: `data:image/png;base64,...` + +### 2. Links + +Relative Links werden zu absoluten Pfaden transformiert. Das ist notwendig, weil Angular `` verwendet. + +#### Anker-Links (TOC) + +```markdown +[Einleitung](#einleitung) +``` + +**Build-Output:** +```html +Einleitung ``` -## URL Placeholder +#### Cross-Article Links -Generated URLs use `%%MARKDOWN_BASE_URL%%` as a placeholder: -- `%%MARKDOWN_BASE_URL%%/blog/2024-post/image.png` -- `%%MARKDOWN_BASE_URL%%/material/chapter-1/diagram.svg` +```markdown +[Anderer Artikel](../other-article) +[Anderer Artikel mit Anker](../other-article#setup) +``` + +**Build-Output:** +```html +Anderer Artikel +Anderer Artikel mit Anker +``` + +**Nicht transformiert werden:** +- Absolute URLs: `https://angular.io/docs` +- Bereits absolute Pfade: `/blog/other-article` +- mailto: `mailto:team@example.com` +- tel: `tel:+49123456` +- ftp: `ftp://files.example.com/file.zip` + +### 3. Automatisches Inhaltsverzeichnis (TOC) + +Platziere `[[toc]]` im Markdown, um ein automatisches Inhaltsverzeichnis zu generieren. + +#### Beispiel + +```markdown +--- +title: Mein Artikel +published: 2024-01-15 +--- + +## Inhalt + +[[toc]] + +## Einleitung -The consuming website replaces this placeholder with the actual base URL at runtime. +Lorem ipsum... -## Input/Output +### Unterkapitel -**Input:** `../blog/` and `../material/` folders with Markdown READMEs +Mehr Text... 
-**Output:** `./dist/` folder with: -- `dist/blog/list.json` - Light blog list for overview -- `dist/blog/{slug}/entry.json` - Full blog entry -- `dist/material/list.json` - Light material list -- `dist/material/{slug}/entry.json` - Full material entry +## Fazit + +Ende. +``` + +#### Generierter Output + +```html +

<h2 id="inhalt">Inhalt</h2>

+ +``` + +#### Regeln + +| Regel | Beschreibung | +|-------|--------------| +| **Nur h2 und h3** | h1 und h4+ werden ignoriert | +| **Nach dem Marker** | Headings vor `[[toc]]` werden übersprungen | +| **Automatische IDs** | Heading-IDs werden von `marked-gfm-heading-id` generiert | +| **Sonderzeichen** | `Über uns` → `#%C3%BCber-uns`, `FAQ & Hilfe` → `#faq--hilfe` | + +### 4. Syntax-Highlighting + +Code-Blöcke werden automatisch mit highlight.js formatiert: + +````markdown +```typescript +const greeting = 'Hello World'; +console.log(greeting); +``` +```` + +### 5. Raw HTML + +HTML im Markdown wird unverändert durchgereicht: + +```markdown +
+

Custom styled content

+
+ + +``` + +**Sicherheitshinweis:** Das ist beabsichtigt. Wir vertrauen unserem eigenen Repository. Es gibt keinen User-Generated Content. + +### 6. Emojis + +Emoji-Shortcodes werden zu Unicode konvertiert: + +```markdown +Hello :smile: World :rocket: +``` + +**Output:** Hello 😄 World 🚀 + +--- + +## YAML Frontmatter + +Jeder Artikel benötigt YAML Frontmatter: + +```yaml +--- +title: "Artikel-Titel" +author: Max Mustermann +mail: max@example.com +published: 2024-01-15 +language: de +header: header.jpg +keywords: + - Angular + - TypeScript +# Optional: +lastModified: 2024-02-01 +hidden: false # Artikel nicht in Liste anzeigen +sticky: false # Artikel oben anpinnen +darkenHeader: false +author2: Co-Autor +mail2: co@example.com +bio: Kurze Bio des Autors +--- +``` + +### Datum-Formate + +Beide Formate werden unterstützt: + +```yaml +published: 2024-01-15 # Wird zu ISO-String konvertiert +published: "2024-01-15T10:00:00Z" # Bleibt als String +``` + +--- + +## Entwicklung + +### Tests + +```bash +npm test # Einmalig +npm run test:watch # Watch-Mode +``` + +131 Tests decken ab: +- Markdown-Parsing und HTML-Generierung +- Bild- und Link-Transformation +- TOC-Generierung +- Edge Cases (mailto, tel, CRLF, etc.) + +### TypeScript + +```bash +npm run typecheck # Typen prüfen +``` + +### Architektur + +``` +Markdown (README.md) + ↓ +JekyllMarkdownParser + ├── YAML Frontmatter → parsedYaml + ├── Markdown → marked → HTML + ├── Image URLs → transformiert mit Placeholder + ├── Links → transformiert zu absoluten Pfaden + └── TOC → generiert aus Headings + ↓ +entry.json +``` + +--- + +## Submodule-Hinweis + +Dieses Repository wird als Git-Submodule in `website-articles` eingebunden. + +**Änderungen immer hier vornehmen**, nicht im `build/`-Ordner des Parent-Repos! 
+ +```bash +# RICHTIG: Hier arbeiten +cd website-articles-build +git checkout -b feature/xyz + +# FALSCH: Nicht im Submodule arbeiten +cd website-articles/build # ❌ +``` From bdee04454dc880569f41ff0f48690bdd48491b5d Mon Sep 17 00:00:00 2001 From: Johannes Hoppe Date: Sat, 7 Feb 2026 17:42:40 +0100 Subject: [PATCH 05/16] docs: clarify Angular website reference --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 41ebf9a..42a3453 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ Der Placeholder `%%MARKDOWN_BASE_URL%%` wird zur Laufzeit durch die Angular-App ### 2. Links -Relative Links werden zu absoluten Pfaden transformiert. Das ist notwendig, weil Angular `` verwendet. +Relative Links werden zu absoluten Pfaden transformiert. Das ist notwendig, weil unsere Angular-Website `` verwendet. #### Anker-Links (TOC) From 618deabfddb24084b1bf7e50f31da87b58eb89e1 Mon Sep 17 00:00:00 2001 From: Johannes Hoppe Date: Sat, 7 Feb 2026 17:43:42 +0100 Subject: [PATCH 06/16] docs: translate README to English --- README.md | 202 +++++++++++++++++++++++++++--------------------------- 1 file changed, 101 insertions(+), 101 deletions(-) diff --git a/README.md b/README.md index 42a3453..a42d733 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # website-articles-build -Build-System für Blog- und Material-Artikel. Transformiert Markdown zu JSON für die Angular-Websites. +Build system for blog and material articles. Transforms Markdown to JSON for Angular websites. 
-Wird als Git-Submodule verwendet in: +Used as a Git submodule in: - [angular-buch/website-articles](https://github.com/angular-buch/website-articles) - [angular-schule/website-articles](https://github.com/angular-schule/website-articles) @@ -10,157 +10,157 @@ Wird als Git-Submodule verwendet in: ```bash npm install -npm run build # Einmaliger Build -npm run watch # Watch-Mode für Entwicklung -npm test # Tests ausführen -npm run typecheck # TypeScript prüfen +npm run build # Single build +npm run watch # Watch mode for development +npm test # Run tests +npm run typecheck # TypeScript check ``` -## Projekt-Struktur +## Project Structure ``` website-articles-build/ -├── build.ts # Haupt-Build-Script +├── build.ts # Main build script ├── blog/ -│ ├── blog.types.ts # Blog-spezifische Typen -│ └── blog.utils.ts # Blog-spezifische Utilities +│ ├── blog.types.ts # Blog-specific types +│ └── blog.utils.ts # Blog-specific utilities ├── material/ -│ └── material.types.ts # Material-spezifische Typen +│ └── material.types.ts # Material-specific types └── shared/ - ├── jekyll-markdown-parser.ts # Markdown-Parser - ├── base.utils.ts # Gemeinsame Utilities - └── list.utils.ts # Listen-Utilities + ├── jekyll-markdown-parser.ts # Markdown parser + ├── base.utils.ts # Shared utilities + └── list.utils.ts # List utilities ``` ## Output -Der Build erzeugt für jeden Artikel: +The build generates for each article: -| Output | Beschreibung | -|--------|--------------| -| `dist/blog/{slug}/entry.json` | Vollständiger Artikel mit HTML | -| `dist/blog/list.json` | Liste aller Artikel (Light-Version) | -| `dist/material/{slug}/entry.json` | Vollständiger Material-Eintrag | -| `dist/material/list.json` | Liste aller Material-Einträge | +| Output | Description | +|--------|-------------| +| `dist/blog/{slug}/entry.json` | Full article with HTML | +| `dist/blog/list.json` | List of all articles (light version) | +| `dist/material/{slug}/entry.json` | Full material entry | +| 
`dist/material/list.json` | List of all material entries | --- -## Features für Markdown-Autoren +## Features for Markdown Authors -### 1. Bilder +### 1. Images -Relative Bildpfade werden automatisch transformiert: +Relative image paths are automatically transformed: ```markdown ![Screenshot](screenshot.png) ![Logo](./images/logo.png) ``` -**Build-Output:** +**Build output:** ```html ``` -Der Placeholder `%%MARKDOWN_BASE_URL%%` wird zur Laufzeit durch die Angular-App ersetzt (CDN auf Prod, Proxy in Dev). +The placeholder `%%MARKDOWN_BASE_URL%%` is replaced at runtime by the Angular app (CDN on prod, proxy in dev). -**Nicht transformiert werden:** +**Not transformed:** - Absolute URLs: `https://example.com/image.png` -- Protokoll-relative URLs: `//cdn.example.com/image.png` -- Asset-Pfade: `assets/img/icon.svg` -- Absolute Pfade: `/images/logo.png` -- Data-URIs: `data:image/png;base64,...` +- Protocol-relative URLs: `//cdn.example.com/image.png` +- Asset paths: `assets/img/icon.svg` +- Absolute paths: `/images/logo.png` +- Data URIs: `data:image/png;base64,...` ### 2. Links -Relative Links werden zu absoluten Pfaden transformiert. Das ist notwendig, weil unsere Angular-Website `` verwendet. +Relative links are transformed to absolute paths. This is necessary because our Angular website uses ``. 
-#### Anker-Links (TOC) +#### Anchor Links (TOC) ```markdown -[Einleitung](#einleitung) +[Introduction](#introduction) ``` -**Build-Output:** +**Build output:** ```html -Einleitung +Introduction ``` #### Cross-Article Links ```markdown -[Anderer Artikel](../other-article) -[Anderer Artikel mit Anker](../other-article#setup) +[Other Article](../other-article) +[Other Article with Anchor](../other-article#setup) ``` -**Build-Output:** +**Build output:** ```html -Anderer Artikel -Anderer Artikel mit Anker +Other Article +Other Article with Anchor ``` -**Nicht transformiert werden:** +**Not transformed:** - Absolute URLs: `https://angular.io/docs` -- Bereits absolute Pfade: `/blog/other-article` +- Already absolute paths: `/blog/other-article` - mailto: `mailto:team@example.com` - tel: `tel:+49123456` - ftp: `ftp://files.example.com/file.zip` -### 3. Automatisches Inhaltsverzeichnis (TOC) +### 3. Automatic Table of Contents (TOC) -Platziere `[[toc]]` im Markdown, um ein automatisches Inhaltsverzeichnis zu generieren. +Place `[[toc]]` in your Markdown to generate an automatic table of contents. -#### Beispiel +#### Example ```markdown --- -title: Mein Artikel +title: My Article published: 2024-01-15 --- -## Inhalt +## Contents [[toc]] -## Einleitung +## Introduction Lorem ipsum... -### Unterkapitel +### Subchapter -Mehr Text... +More text... -## Fazit +## Conclusion -Ende. +End. ``` -#### Generierter Output +#### Generated Output ```html -

<h2 id="inhalt">Inhalt</h2>

+

<h2 id="contents">Contents</h2>

``` -#### Regeln +#### Rules -| Regel | Beschreibung | -|-------|--------------| -| **Nur h2 und h3** | h1 und h4+ werden ignoriert | -| **Nach dem Marker** | Headings vor `[[toc]]` werden übersprungen | -| **Automatische IDs** | Heading-IDs werden von `marked-gfm-heading-id` generiert | -| **Sonderzeichen** | `Über uns` → `#%C3%BCber-uns`, `FAQ & Hilfe` → `#faq--hilfe` | +| Rule | Description | +|------|-------------| +| **Only h2 and h3** | h1 and h4+ are ignored | +| **After the marker** | Headings before `[[toc]]` are skipped | +| **Automatic IDs** | Heading IDs are generated by `marked-gfm-heading-id` | +| **Special characters** | `About us` → `#about-us`, `FAQ & Help` → `#faq--help` | -### 4. Syntax-Highlighting +### 4. Syntax Highlighting -Code-Blöcke werden automatisch mit highlight.js formatiert: +Code blocks are automatically formatted with highlight.js: ````markdown ```typescript @@ -171,7 +171,7 @@ console.log(greeting); ### 5. Raw HTML -HTML im Markdown wird unverändert durchgereicht: +HTML in Markdown is passed through unchanged: ```markdown
@@ -181,11 +181,11 @@ HTML im Markdown wird unverändert durchgereicht: ``` -**Sicherheitshinweis:** Das ist beabsichtigt. Wir vertrauen unserem eigenen Repository. Es gibt keinen User-Generated Content. +**Security note:** This is intentional. We trust our own repository. There is no user-generated content. ### 6. Emojis -Emoji-Shortcodes werden zu Unicode konvertiert: +Emoji shortcodes are converted to Unicode: ```markdown Hello :smile: World :rocket: @@ -197,63 +197,63 @@ Hello :smile: World :rocket: ## YAML Frontmatter -Jeder Artikel benötigt YAML Frontmatter: +Every article requires YAML frontmatter: ```yaml --- -title: "Artikel-Titel" -author: Max Mustermann -mail: max@example.com +title: "Article Title" +author: John Doe +mail: john@example.com published: 2024-01-15 -language: de +language: en header: header.jpg keywords: - Angular - TypeScript # Optional: lastModified: 2024-02-01 -hidden: false # Artikel nicht in Liste anzeigen -sticky: false # Artikel oben anpinnen +hidden: false # Don't show article in list +sticky: false # Pin article to top darkenHeader: false -author2: Co-Autor +author2: Co-Author mail2: co@example.com -bio: Kurze Bio des Autors +bio: Short author bio --- ``` -### Datum-Formate +### Date Formats -Beide Formate werden unterstützt: +Both formats are supported: ```yaml -published: 2024-01-15 # Wird zu ISO-String konvertiert -published: "2024-01-15T10:00:00Z" # Bleibt als String +published: 2024-01-15 # Converted to ISO string +published: "2024-01-15T10:00:00Z" # Stays as string ``` --- -## Entwicklung +## Development ### Tests ```bash -npm test # Einmalig -npm run test:watch # Watch-Mode +npm test # Single run +npm run test:watch # Watch mode ``` -131 Tests decken ab: -- Markdown-Parsing und HTML-Generierung -- Bild- und Link-Transformation -- TOC-Generierung -- Edge Cases (mailto, tel, CRLF, etc.) 
+131 tests cover: +- Markdown parsing and HTML generation +- Image and link transformation +- TOC generation +- Edge cases (mailto, tel, CRLF, etc.) ### TypeScript ```bash -npm run typecheck # Typen prüfen +npm run typecheck # Type check ``` -### Architektur +### Architecture ``` Markdown (README.md) @@ -261,26 +261,26 @@ Markdown (README.md) JekyllMarkdownParser ├── YAML Frontmatter → parsedYaml ├── Markdown → marked → HTML - ├── Image URLs → transformiert mit Placeholder - ├── Links → transformiert zu absoluten Pfaden - └── TOC → generiert aus Headings + ├── Image URLs → transformed with placeholder + ├── Links → transformed to absolute paths + └── TOC → generated from headings ↓ entry.json ``` --- -## Submodule-Hinweis +## Submodule Warning -Dieses Repository wird als Git-Submodule in `website-articles` eingebunden. +This repository is included as a Git submodule in `website-articles`. -**Änderungen immer hier vornehmen**, nicht im `build/`-Ordner des Parent-Repos! +**Always make changes here**, not in the `build/` folder of the parent repo! ```bash -# RICHTIG: Hier arbeiten +# CORRECT: Work here cd website-articles-build git checkout -b feature/xyz -# FALSCH: Nicht im Submodule arbeiten +# WRONG: Don't work in the submodule cd website-articles/build # ❌ ``` From 5ed2b078a40d0bb1fb8cb6afc10f9664bfdead6c Mon Sep 17 00:00:00 2001 From: Johannes Hoppe Date: Sat, 7 Feb 2026 20:13:28 +0100 Subject: [PATCH 07/16] refactor: replace marked-gfm-heading-id with inline fork - Fork marked-gfm-heading-id to shared/gfm-heading-id/ - Convert to TypeScript with improved types - Simplified API (removed unused globalSlugs option) - Better entity handling (decode before slugging) - Replace marked-gfm-heading-id dependency with github-slugger - Remove now-empty types.d.ts - Update README with github-slugger reference The fork is only 100 lines and gives us full control over heading ID generation. All 131 tests pass. 
--- README.md | 4 +- package-lock.json | 18 +---- package.json | 2 +- shared/gfm-heading-id/index.ts | 110 ++++++++++++++++++++++++++ shared/jekyll-markdown-parser.spec.ts | 2 +- shared/jekyll-markdown-parser.ts | 2 +- shared/types.d.ts | 18 ----- 7 files changed, 118 insertions(+), 38 deletions(-) create mode 100644 shared/gfm-heading-id/index.ts delete mode 100644 shared/types.d.ts diff --git a/README.md b/README.md index a42d733..299479e 100644 --- a/README.md +++ b/README.md @@ -155,8 +155,8 @@ End. |------|-------------| | **Only h2 and h3** | h1 and h4+ are ignored | | **After the marker** | Headings before `[[toc]]` are skipped | -| **Automatic IDs** | Heading IDs are generated by `marked-gfm-heading-id` | -| **Special characters** | `About us` → `#about-us`, `FAQ & Help` → `#faq--help` | +| **Automatic IDs** | Heading IDs follow [GitHub's algorithm](https://github.com/Flet/github-slugger) | +| **Special characters** | Umlauts preserved (`Über uns` → `#über-uns`), `&` removed | ### 4. 
Syntax Highlighting diff --git a/package-lock.json b/package-lock.json index 08cc5d5..4408364 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,20 +1,20 @@ { - "name": "website-articles", + "name": "website-articles-build", "version": "1.0.0", "lockfileVersion": 3, "requires": true, "packages": { "": { - "name": "website-articles", + "name": "website-articles-build", "version": "1.0.0", "license": "ISC", "dependencies": { "fs-extra": "^11.2.0", + "github-slugger": "^2.0.0", "highlight.js": "^11.10.0", "image-size": "^2.0.2", "js-yaml": "^4.1.0", "marked": "^17.0.1", - "marked-gfm-heading-id": "^4.1.3", "marked-highlight": "^2.2.3", "node-emoji": "^2.1.3" }, @@ -723,18 +723,6 @@ "node": ">= 20" } }, - "node_modules/marked-gfm-heading-id": { - "version": "4.1.3", - "resolved": "https://registry.npmjs.org/marked-gfm-heading-id/-/marked-gfm-heading-id-4.1.3.tgz", - "integrity": "sha512-aR0i63LmFbuxU/gAgrgz1Ir+8HK6zAIFXMlckeKHpV+qKbYaOP95L4Ux5Gi+sKmCZU5qnN2rdKpvpb7PnUBIWg==", - "license": "MIT", - "dependencies": { - "github-slugger": "^2.0.0" - }, - "peerDependencies": { - "marked": ">=13 <18" - } - }, "node_modules/marked-highlight": { "version": "2.2.3", "resolved": "https://registry.npmjs.org/marked-highlight/-/marked-highlight-2.2.3.tgz", diff --git a/package.json b/package.json index 0c19d94..c87617e 100644 --- a/package.json +++ b/package.json @@ -16,8 +16,8 @@ "highlight.js": "^11.10.0", "image-size": "^2.0.2", "js-yaml": "^4.1.0", + "github-slugger": "^2.0.0", "marked": "^17.0.1", - "marked-gfm-heading-id": "^4.1.3", "marked-highlight": "^2.2.3", "node-emoji": "^2.1.3" }, diff --git a/shared/gfm-heading-id/index.ts b/shared/gfm-heading-id/index.ts new file mode 100644 index 0000000..4b401e0 --- /dev/null +++ b/shared/gfm-heading-id/index.ts @@ -0,0 +1,110 @@ +/** + * GitHub Flavored Markdown Heading ID Extension for Marked + * + * Forked from: https://github.com/markedjs/marked-gfm-heading-id (v4.1.3) + * Original license: MIT + * + * Changes from 
original: + * - Converted to TypeScript + * - Simplified API (removed globalSlugs option - we always reset per document) + * - Improved entity handling (decode before slugging, not after) + * - Added HeadingData export for TOC generation + */ + +import GithubSlugger from 'github-slugger'; +import type { MarkedExtension, Tokens } from 'marked'; + +export interface HeadingData { + level: number; + /** The heading text (may contain HTML entities from marked) */ + text: string; + /** The raw heading text (HTML tags stripped, entities decoded) */ + raw: string; + /** The generated slug ID */ + id: string; +} + +let slugger = new GithubSlugger(); +let headings: HeadingData[] = []; + +/** + * Decode HTML entities to their original characters. + * Marked escapes special chars in heading text, we need to decode for slugging. + */ +function decodeHtmlEntities(html: string): string { + return html + .replace(/&/g, '&') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/'/g, "'") + .replace(///g, '/'); +} + +/** + * Strip HTML tags from text. + * Used to get plain text from heading content. + */ +function stripHtmlTags(html: string): string { + return html.replace(/<[^>]*>/g, ''); +} + +/** + * Create a marked extension that adds GitHub-style heading IDs. + * + * @param options.prefix - Optional prefix for all heading IDs + * @returns MarkedExtension to pass to marked.use() + * + * @example + * ```typescript + * const marked = new Marked(gfmHeadingId()); + * marked.parse('# Hello World'); + * //

<h1 id="hello-world">Hello World</h1>

+ * ``` + */ +export function gfmHeadingId({ prefix = '' } = {}): MarkedExtension { + return { + hooks: { + preprocess(src: string): string { + // Always reset for each document (we process one doc at a time) + resetHeadings(); + return src; + }, + }, + renderer: { + heading({ tokens, depth }: Tokens.Heading): string { + // Get the rendered HTML text (may contain HTML entities and tags) + // @ts-ignore - 'this' context is provided by marked at runtime + const text: string = this.parser.parseInline(tokens); + + // Get raw text: decode entities, strip HTML tags + const raw = stripHtmlTags(decodeHtmlEntities(text)).trim(); + + const level = depth; + const id = `${prefix}${slugger.slug(raw)}`; + + headings.push({ level, text, id, raw }); + + return `${text}\n`; + }, + }, + }; +} + +/** + * Get the list of headings collected during the last parse. + * Call this after marked.parse() to get heading data for TOC generation. + */ +export function getHeadingList(): HeadingData[] { + return headings; +} + +/** + * Reset the heading list and slugger. + * Called automatically in preprocess hook, but can be called manually if needed. 
+ */ +export function resetHeadings(): void { + headings = []; + slugger = new GithubSlugger(); +} diff --git a/shared/jekyll-markdown-parser.spec.ts b/shared/jekyll-markdown-parser.spec.ts index e77f9c9..795d112 100644 --- a/shared/jekyll-markdown-parser.spec.ts +++ b/shared/jekyll-markdown-parser.spec.ts @@ -1,7 +1,7 @@ import { describe, it, expect } from 'vitest'; import { Marked } from 'marked'; import { markedHighlight } from 'marked-highlight'; -import { gfmHeadingId } from 'marked-gfm-heading-id'; +import { gfmHeadingId } from './gfm-heading-id'; import hljs from 'highlight.js'; import { JekyllMarkdownParser, MARKDOWN_BASE_URL_PLACEHOLDER, TOC_MARKER } from './jekyll-markdown-parser'; diff --git a/shared/jekyll-markdown-parser.ts b/shared/jekyll-markdown-parser.ts index 2035e42..74dfa9f 100644 --- a/shared/jekyll-markdown-parser.ts +++ b/shared/jekyll-markdown-parser.ts @@ -2,7 +2,7 @@ import { posix as path } from 'path'; import { load } from 'js-yaml'; import { Marked, Renderer, Tokens } from 'marked'; import { markedHighlight } from 'marked-highlight'; -import { gfmHeadingId, getHeadingList, resetHeadings } from 'marked-gfm-heading-id'; +import { gfmHeadingId, getHeadingList, resetHeadings } from './gfm-heading-id'; import hljs from 'highlight.js'; /** diff --git a/shared/types.d.ts b/shared/types.d.ts deleted file mode 100644 index 8a1120b..0000000 --- a/shared/types.d.ts +++ /dev/null @@ -1,18 +0,0 @@ -declare module 'marked-gfm-heading-id' { - import type { MarkedExtension } from 'marked'; - - interface GfmHeadingIdOptions { - prefix?: string; - } - - export interface HeadingData { - level: number; - text: string; - raw: string; - id: string; - } - - export function gfmHeadingId(options?: GfmHeadingIdOptions): MarkedExtension; - export function getHeadingList(): HeadingData[]; - export function resetHeadings(): void; -} From afdfc3d0e6318dc2727db2e8c3c8c2049619e446 Mon Sep 17 00:00:00 2001 From: Johannes Hoppe Date: Sat, 7 Feb 2026 20:15:03 +0100 
Subject: [PATCH 08/16] refactor: simplify TOC generation using h.raw from fork - Remove decodeHtmlEntities() from JekyllMarkdownParser (now in fork) - Use h.raw directly instead of decoding h.text - Update comment to reference our gfm-heading-id fork The fork provides heading.raw which is already decoded and stripped of HTML tags, eliminating duplicate code. --- shared/jekyll-markdown-parser.ts | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/shared/jekyll-markdown-parser.ts b/shared/jekyll-markdown-parser.ts index 74dfa9f..11cf54c 100644 --- a/shared/jekyll-markdown-parser.ts +++ b/shared/jekyll-markdown-parser.ts @@ -169,22 +169,9 @@ export class JekyllMarkdownParser { .replace(/>/g, '>'); } - /** - * Decode common HTML entities back to their original characters. - * Used for TOC generation where we need plain text from marked's escaped output. - */ - private decodeHtmlEntities(text: string): string { - return text - .replace(/&/g, '&') - .replace(/</g, '<') - .replace(/>/g, '>') - .replace(/"/g, '"') - .replace(/'/g, "'"); - } - /** * Generate a table of contents as Markdown from the document's headings. - * Uses getHeadingList() from marked-gfm-heading-id to get heading IDs. + * Uses getHeadingList() from our gfm-heading-id fork. 
* * @param markdown - The markdown content to extract headings from * @returns Markdown list with links to headings, or empty string if no headings @@ -201,7 +188,8 @@ export class JekyllMarkdownParser { // Only include h2 and h3 if (h.level < 2 || h.level > 3) return false; // Skip the heading that contains the TOC (usually "Inhalt" or "Contents") - const headingPattern = new RegExp(`^#{${h.level}}\\s+${this.escapeRegex(this.decodeHtmlEntities(h.text))}`, 'm'); + // h.raw is already decoded (no HTML entities) thanks to our gfm-heading-id fork + const headingPattern = new RegExp(`^#{${h.level}}\\s+${this.escapeRegex(h.raw)}`, 'm'); const match = markdown.match(headingPattern); if (match && match.index !== undefined && match.index < tocIndex) { return false; @@ -217,8 +205,7 @@ export class JekyllMarkdownParser { return headingsAfterMarker .map(h => { const indent = h.level === 3 ? ' ' : ''; - const text = this.decodeHtmlEntities(h.text); - return `${indent}* [${text}](#${h.id})`; + return `${indent}* [${h.raw}](#${h.id})`; }) .join('\n'); } From 8e17e61dac9d04b47a5e9fc66fd32169d853a4fb Mon Sep 17 00:00:00 2001 From: Johannes Hoppe Date: Sat, 7 Feb 2026 20:25:53 +0100 Subject: [PATCH 09/16] refactor: major cleanup and simplification Code Quality Improvements: - Extract shared utilities to html.utils.ts (stripHtmlTags, decodeHtmlEntities, escapeHtml) - Precompile regex patterns for better performance - Simplify parse() return type (remove unused yaml/markdown fields) - Update documentation to reference our gfm-heading-id fork TOC Generation Simplification: - Replace complex position-tracking algorithm with simple split approach - Split markdown at [[toc]] marker, parse only content after it - Reduces generateToc from 60+ lines to 30 lines - More robust: no regex matching of heading text needed Net result: -40 lines of code, cleaner architecture, same functionality. All 131 tests pass. 
--- shared/gfm-heading-id/index.ts | 24 +------- shared/html.utils.ts | 35 +++++++++++ shared/jekyll-markdown-parser.spec.ts | 3 +- shared/jekyll-markdown-parser.ts | 87 +++++++++++---------------- shared/list.utils.ts | 2 +- 5 files changed, 73 insertions(+), 78 deletions(-) create mode 100644 shared/html.utils.ts diff --git a/shared/gfm-heading-id/index.ts b/shared/gfm-heading-id/index.ts index 4b401e0..310eaf0 100644 --- a/shared/gfm-heading-id/index.ts +++ b/shared/gfm-heading-id/index.ts @@ -13,6 +13,7 @@ import GithubSlugger from 'github-slugger'; import type { MarkedExtension, Tokens } from 'marked'; +import { decodeHtmlEntities, stripHtmlTags } from '../html.utils'; export interface HeadingData { level: number; @@ -27,29 +28,6 @@ export interface HeadingData { let slugger = new GithubSlugger(); let headings: HeadingData[] = []; -/** - * Decode HTML entities to their original characters. - * Marked escapes special chars in heading text, we need to decode for slugging. - */ -function decodeHtmlEntities(html: string): string { - return html - .replace(/&/g, '&') - .replace(/</g, '<') - .replace(/>/g, '>') - .replace(/"/g, '"') - .replace(/'/g, "'") - .replace(/'/g, "'") - .replace(///g, '/'); -} - -/** - * Strip HTML tags from text. - * Used to get plain text from heading content. - */ -function stripHtmlTags(html: string): string { - return html.replace(/<[^>]*>/g, ''); -} - /** * Create a marked extension that adds GitHub-style heading IDs. * diff --git a/shared/html.utils.ts b/shared/html.utils.ts new file mode 100644 index 0000000..213aeef --- /dev/null +++ b/shared/html.utils.ts @@ -0,0 +1,35 @@ +/** + * Shared HTML utility functions. + */ + +/** + * Strip all HTML tags from a string, leaving only text content. + */ +export function stripHtmlTags(html: string): string { + return html.replace(/<[^>]*>/g, ''); +} + +/** + * Decode common HTML entities to their original characters. 
+ */ +export function decodeHtmlEntities(html: string): string { + return html + .replace(/&/g, '&') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/'/g, "'") + .replace(///g, '/'); +} + +/** + * Escape special HTML characters for use in attribute values. + */ +export function escapeHtml(text: string): string { + return text + .replace(/&/g, '&') + .replace(/"/g, '"') + .replace(//g, '>'); +} diff --git a/shared/jekyll-markdown-parser.spec.ts b/shared/jekyll-markdown-parser.spec.ts index 795d112..9fb9ab1 100644 --- a/shared/jekyll-markdown-parser.spec.ts +++ b/shared/jekyll-markdown-parser.spec.ts @@ -484,13 +484,12 @@ author: John Doe This is a test. `; const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); - const { parsedYaml, html, markdown } = parser.parse(input); + const { parsedYaml, html } = parser.parse(input); expect(parsedYaml.title).toBe('Test Post'); expect(parsedYaml.author).toBe('John Doe'); expect(html).toContain('

<h1 id="hello-world">Hello World</h1>

'); expect(html).toContain('

<p>This is a test.</p>

'); - expect(markdown).toBe('\n# Hello World\n\nThis is a test.\n'); }); it('should throw for markdown without frontmatter', () => { diff --git a/shared/jekyll-markdown-parser.ts b/shared/jekyll-markdown-parser.ts index 11cf54c..93b2529 100644 --- a/shared/jekyll-markdown-parser.ts +++ b/shared/jekyll-markdown-parser.ts @@ -4,6 +4,12 @@ import { Marked, Renderer, Tokens } from 'marked'; import { markedHighlight } from 'marked-highlight'; import { gfmHeadingId, getHeadingList, resetHeadings } from './gfm-heading-id'; import hljs from 'highlight.js'; +import { escapeHtml } from './html.utils'; + +// Precompiled regexes for performance +const PROTOCOL_REGEX = /^\w+:/; +const IMG_SRC_REGEX = /]*)\ssrc=(["'])([^"']+)\2/g; +const ANCHOR_HREF_REGEX = /]*)\shref=(["'])([^"']+)\2/g; /** * Placeholder for image base URL. Replaced at runtime by the Angular app. @@ -103,8 +109,11 @@ export const TOC_MARKER = '[[toc]]'; * 6. UPGRADE: marked v4 → v17 migration * - Using Marked class instance instead of global marked * - marked-highlight extension for syntax highlighting - * - marked-gfm-heading-id extension for heading IDs + * - Custom gfm-heading-id fork for heading IDs (see ./gfm-heading-id/) * - Token-based renderer API (token object instead of separate params) + * + * 7. REFACTOR: Shared utilities extracted to ./html.utils.ts + * - escapeHtml, decodeHtmlEntities, stripHtmlTags * ============================================================================ */ export class JekyllMarkdownParser { @@ -142,7 +151,7 @@ export class JekyllMarkdownParser { */ private isAbsoluteUrl(url: string): boolean { // Protocol pattern: word characters followed by colon (mailto:, tel:, https:, http:, ftp:, data:, etc.) - if (/^\w+:/.test(url)) { + if (PROTOCOL_REGEX.test(url)) { return true; } return url.startsWith('//') || @@ -158,51 +167,36 @@ export class JekyllMarkdownParser { return url.startsWith('./') ? url.slice(2) : url; } - /** - * Escape special HTML characters in attribute values. 
- */ - private escapeHtml(text: string): string { - return text - .replace(/&/g, '&') - .replace(/"/g, '"') - .replace(//g, '>'); - } - /** * Generate a table of contents as Markdown from the document's headings. - * Uses getHeadingList() from our gfm-heading-id fork. + * Only includes headings that appear AFTER the [[toc]] marker. * * @param markdown - The markdown content to extract headings from * @returns Markdown list with links to headings, or empty string if no headings */ private generateToc(markdown: string): string { - // Parse markdown to collect headings (result is discarded, we only need side effect) + // Split at marker - only parse content AFTER the marker + const parts = markdown.split(TOC_MARKER); + if (parts.length < 2) { + return ''; + } + + const contentAfterMarker = parts.slice(1).join(TOC_MARKER); // Handle multiple markers (edge case) + + // Parse only the part after [[toc]] to collect headings resetHeadings(); - this.marked.parse(markdown); + this.marked.parse(contentAfterMarker); const headings = getHeadingList(); - // Filter to h2 and h3, skip headings that appear before [[toc]] marker - const tocIndex = markdown.indexOf(TOC_MARKER); - const headingsAfterMarker = headings.filter(h => { - // Only include h2 and h3 - if (h.level < 2 || h.level > 3) return false; - // Skip the heading that contains the TOC (usually "Inhalt" or "Contents") - // h.raw is already decoded (no HTML entities) thanks to our gfm-heading-id fork - const headingPattern = new RegExp(`^#{${h.level}}\\s+${this.escapeRegex(h.raw)}`, 'm'); - const match = markdown.match(headingPattern); - if (match && match.index !== undefined && match.index < tocIndex) { - return false; - } - return true; - }); + // Filter to h2 and h3 only + const relevantHeadings = headings.filter(h => h.level >= 2 && h.level <= 3); - if (headingsAfterMarker.length === 0) { + if (relevantHeadings.length === 0) { return ''; } // Generate markdown list - return headingsAfterMarker + return relevantHeadings 
.map(h => { const indent = h.level === 3 ? ' ' : ''; return `${indent}* [${h.raw}](#${h.id})`; @@ -210,13 +204,6 @@ export class JekyllMarkdownParser { .join('\n'); } - /** - * Escape special regex characters in a string. - */ - private escapeRegex(str: string): string { - return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); - } - /** * Custom image renderer that transforms relative URLs to absolute URLs. * marked v17 uses token-based API: renderer receives a token object. @@ -231,10 +218,10 @@ export class JekyllMarkdownParser { src = this.baseUrl + this.normalizeRelativeUrl(token.href); } - const escapedAlt = this.escapeHtml(token.text); + const escapedAlt = escapeHtml(token.text); let out = `${escapedAlt} tags to absolute URLs // Supports both double quotes (src="...") and single quotes (src='...') private transformRelativeImagePaths(html: string): string { - return html.replace(/]*)\ssrc=(["'])([^"']+)\2/g, (match, attrs, quote, src) => { + return html.replace(IMG_SRC_REGEX, (match, attrs, quote, src) => { if (this.isAbsoluteUrl(src)) { return match; } @@ -261,7 +248,7 @@ export class JekyllMarkdownParser { * - ../other-slug#section → /blog/other-slug#section */ private transformRelativeLinks(html: string): string { - return html.replace(/]*)\shref=(["'])([^"']+)\2/g, (match, attrs, quote, href) => { + return html.replace(ANCHOR_HREF_REGEX, (match, attrs, quote, href) => { if (this.isAbsoluteUrl(href)) { return match; } @@ -306,13 +293,16 @@ export class JekyllMarkdownParser { private compileMarkdown(markdown: string): string { // Generate TOC if marker is present + // Note: This parses twice when TOC is present - once to collect headings, once for final HTML. + // This is intentional: we need heading data before generating TOC, but TOC must be in the + // document before final parsing. The overhead is minimal for typical blog post sizes. 
let processedMarkdown = markdown; if (markdown.includes(TOC_MARKER)) { const toc = this.generateToc(markdown); processedMarkdown = markdown.replace(TOC_MARKER, toc); } - // Reset headings and parse (generateToc already parsed once, but we need fresh state) + // Reset headings for clean state (generateToc may have populated them) resetHeadings(); const html = this.marked.parse(processedMarkdown) as string; const withImages = this.transformRelativeImagePaths(html); @@ -329,19 +319,12 @@ export class JekyllMarkdownParser { public parse(jekyllMarkdown: string): { html: string; - yaml: string; parsedYaml: Record; - markdown: string; } { const { yaml, markdown } = this.separate(jekyllMarkdown); const parsedYaml = this.parseYaml(yaml); const html = this.compileMarkdown(markdown); - return { - html, - markdown, - parsedYaml, - yaml - }; + return { html, parsedYaml }; } } diff --git a/shared/list.utils.ts b/shared/list.utils.ts index b1bac4c..f42e1e6 100644 --- a/shared/list.utils.ts +++ b/shared/list.utils.ts @@ -1,4 +1,5 @@ import { EntryBase } from './base.types'; +import { stripHtmlTags } from './html.utils'; /** * Extract the first "big" paragraph from HTML content. 
@@ -17,7 +18,6 @@ export function extractFirstBigParagraph(html: string): string { return ''; } - const stripHtmlTags = (s: string) => s.replace(/<[^>]*>/g, ''); const bigParagraph = matches.find(m => m && stripHtmlTags(m).length > 100); const paragraph = bigParagraph || matches[0] || ''; return paragraph.replace(/(.*?)<\/a>/g, '$1'); From 52423f4b29833de1a63fbfc671b12ea34b8ac8a7 Mon Sep 17 00:00:00 2001 From: Johannes Hoppe Date: Sat, 7 Feb 2026 20:48:24 +0100 Subject: [PATCH 10/16] feat: preserve HTML formatting in TOC links + add comprehensive tests - TOC links now preserve inline formatting (, , ) - Add warning for duplicate headings (known limitation) - Move gfm-heading-id.ts out of subdirectory - Add html.utils.spec.ts (45 tests) - Add gfm-heading-id.spec.ts (24 tests) - Add 5 TOC formatting tests - Document HeadingData text vs raw separation - Clarify slug terminology in base.types.ts Total: 205 tests passing --- shared/base.types.ts | 1 + shared/gfm-heading-id.spec.ts | 230 ++++++++++++++++++ .../index.ts => gfm-heading-id.ts} | 46 +++- shared/html.utils.spec.ts | 182 ++++++++++++++ shared/jekyll-markdown-parser.spec.ts | 133 ++++++++++ shared/jekyll-markdown-parser.ts | 11 +- 6 files changed, 595 insertions(+), 8 deletions(-) create mode 100644 shared/gfm-heading-id.spec.ts rename shared/{gfm-heading-id/index.ts => gfm-heading-id.ts} (54%) create mode 100644 shared/html.utils.spec.ts diff --git a/shared/base.types.ts b/shared/base.types.ts index e3e2f4e..4819076 100644 --- a/shared/base.types.ts +++ b/shared/base.types.ts @@ -19,6 +19,7 @@ export interface EntryMetaBase { } export interface EntryBase { + /** URL-friendly identifier derived from folder name: "2024-01-my-post" */ slug: string; html: string; meta: EntryMetaBase; diff --git a/shared/gfm-heading-id.spec.ts b/shared/gfm-heading-id.spec.ts new file mode 100644 index 0000000..d2f7fbd --- /dev/null +++ b/shared/gfm-heading-id.spec.ts @@ -0,0 +1,230 @@ +/** + * Tests for gfm-heading-id.ts + * + * 
Adapted from: https://github.com/markedjs/marked-gfm-heading-id + * Original tests by marked team, MIT license. + * + * Key behaviors tested: + * 1. ID generation with github-slugger + * 2. Heading list collection + * 3. Reset functionality + * 4. text (with HTML) vs raw (plain text) separation + */ + +import { describe, it, expect, beforeEach } from 'vitest'; +import { Marked } from 'marked'; +import { gfmHeadingId, getHeadingList, resetHeadings, HeadingData } from './gfm-heading-id'; + +describe('gfm-heading-id', () => { + let marked: Marked; + + beforeEach(() => { + resetHeadings(); + marked = new Marked(gfmHeadingId()); + }); + + describe('ID generation', () => { + it('should generate lowercase slugified IDs', () => { + marked.parse('# Hello World'); + expect(getHeadingList()[0].id).toBe('hello-world'); + }); + + it('should increment IDs for duplicate headings', () => { + marked.parse('# foo\n\n# foo\n\n# foo'); + const headings = getHeadingList(); + + expect(headings[0].id).toBe('foo'); + expect(headings[1].id).toBe('foo-1'); + expect(headings[2].id).toBe('foo-2'); + }); + + it('should handle heading text that looks like an ID suffix', () => { + // "foo 1" as text should not conflict with "foo-1" as auto-suffix + marked.parse('# foo 1\n\n# foo\n\n# foo'); + const headings = getHeadingList(); + + expect(headings[0].id).toBe('foo-1'); // "foo 1" → "foo-1" + expect(headings[1].id).toBe('foo'); // first "foo" + expect(headings[2].id).toBe('foo-2'); // second "foo" → "foo-2" (not foo-1!) 
+ }); + + it('should support prefix option', () => { + marked = new Marked(gfmHeadingId({ prefix: 'custom-' })); + marked.parse('# Test'); + expect(getHeadingList()[0].id).toBe('custom-test'); + }); + + it('should handle German umlauts (github-slugger behavior)', () => { + marked.parse('# Über uns'); + expect(getHeadingList()[0].id).toBe('über-uns'); + }); + + it('should handle special characters in headings', () => { + marked.parse('# FAQ & Hilfe'); + // github-slugger removes & but keeps surrounding chars + expect(getHeadingList()[0].id).toBe('faq--hilfe'); + }); + }); + + describe('getHeadingList()', () => { + it('should collect all headings with correct levels', () => { + marked.parse('# H1\n## H2\n### H3\n#### H4\n##### H5\n###### H6'); + const headings = getHeadingList(); + + expect(headings).toHaveLength(6); + expect(headings.map(h => h.level)).toEqual([1, 2, 3, 4, 5, 6]); + }); + + it('should return HeadingData with all required properties', () => { + marked.parse('## Test Heading'); + const heading = getHeadingList()[0]; + + expect(heading).toMatchObject({ + level: 2, + text: 'Test Heading', + raw: 'Test Heading', + id: 'test-heading' + }); + }); + + it('should clear list on each new parse (preprocess hook)', () => { + marked.parse('# First'); + expect(getHeadingList()).toHaveLength(1); + + marked.parse('# Second\n## Third'); + expect(getHeadingList()).toHaveLength(2); + expect(getHeadingList()[0].raw).toBe('Second'); + }); + }); + + describe('resetHeadings()', () => { + it('should clear heading list when called manually', () => { + marked.parse('# Test'); + expect(getHeadingList()).toHaveLength(1); + + resetHeadings(); + expect(getHeadingList()).toHaveLength(0); + }); + + it('should reset slugger counter', () => { + marked.parse('# foo\n\n# foo'); + expect(getHeadingList()[1].id).toBe('foo-1'); + + resetHeadings(); + // Slug counter should start fresh + marked.parse('# foo\n\n# foo'); + expect(getHeadingList()[1].id).toBe('foo-1'); // NOT foo-3 + }); + 
}); + + describe('text vs raw separation (OUR KEY IMPROVEMENT)', () => { + /** + * This is the main value of our fork: + * - text: preserves HTML as rendered by marked (for display) + * - raw: plain text with HTML stripped and entities decoded (for TOC) + */ + + it('should preserve HTML in text but strip from raw', () => { + marked.parse('# Hello **world**'); + const heading = getHeadingList()[0]; + + expect(heading.text).toBe('Hello world'); + expect(heading.raw).toBe('Hello world'); + }); + + it('should strip inline code tags from raw', () => { + marked.parse('# Using `npm install`'); + const heading = getHeadingList()[0]; + + expect(heading.text).toContain('npm install'); + expect(heading.raw).toBe('Using npm install'); + }); + + it('should strip nested HTML tags from raw', () => { + marked.parse('# Hello world!'); + const heading = getHeadingList()[0]; + + expect(heading.text).toContain(''); + expect(heading.text).toContain(''); + expect(heading.raw).toBe('Hello world!'); + }); + + it('should decode & entities in raw', () => { + // When marked renders bold, it may produce entities + marked.parse('# Tom **&** Jerry'); // literal & in markdown + const heading = getHeadingList()[0]; + + // The & should be decoded in raw, not show as & + expect(heading.raw).toBe('Tom & Jerry'); + }); + + it('should decode " entities in raw', () => { + // Test actual entity handling - marked escapes quotes in certain contexts + marked.parse('# Title with **"quotes"**'); + const heading = getHeadingList()[0]; + + expect(heading.raw).toBe('Title with "quotes"'); + }); + + it('should decode ' and ' entities (single quotes)', () => { + // Create a heading where marked produces ' + marked.parse("# It's **fine**"); + const heading = getHeadingList()[0]; + + expect(heading.raw).toBe("It's fine"); + }); + }); + + describe('edge cases', () => { + it('should handle empty heading', () => { + marked.parse('# '); + const heading = getHeadingList()[0]; + + expect(heading.raw).toBe(''); + 
expect(heading.id).toBe(''); + }); + + it('should handle HTML comment in heading (stripped by marked)', () => { + marked.parse('# visible text'); + const heading = getHeadingList()[0]; + + // marked v17 strips comments entirely, including surrounding whitespace + expect(heading.raw).toBe('visible text'); + }); + + it('should handle raw HTML that looks like a tag (treated as HTML)', () => { + // is valid HTML, so marked treats it as such + marked.parse('# Text with emphasis'); + const heading = getHeadingList()[0]; + + expect(heading.text).toContain(''); + expect(heading.raw).toBe('Text with emphasis'); + }); + + it('should handle invalid HTML-like content', () => { + // { + it('should produce heading with id attribute', () => { + const html = marked.parse('# Test'); + expect(html).toContain('

<h1 id="test">Test</h1>

'); + }); + + it('should include newline after heading', () => { + const html = marked.parse('# Test'); + expect(html).toBe('

<h1 id="test">Test</h1>

\n'); + }); + + it('should preserve inline HTML in output', () => { + const html = marked.parse('# Hello **world**'); + expect(html).toContain('world'); + }); + }); +}); diff --git a/shared/gfm-heading-id/index.ts b/shared/gfm-heading-id.ts similarity index 54% rename from shared/gfm-heading-id/index.ts rename to shared/gfm-heading-id.ts index 310eaf0..cc4bd46 100644 --- a/shared/gfm-heading-id/index.ts +++ b/shared/gfm-heading-id.ts @@ -4,24 +4,56 @@ * Forked from: https://github.com/markedjs/marked-gfm-heading-id (v4.1.3) * Original license: MIT * - * Changes from original: + * ============================================================================= + * WHY WE FORKED + * ============================================================================= + * + * The original package only provides the heading ID. We need more for TOC + * generation: both the formatted text (with HTML) AND the plain text. + * + * HeadingData provides two representations: + * + * text: "Using npm install" ← HTML preserved (for TOC links) + * raw: "Using npm install" ← Plain text (for slug generation) + * + * Example: Markdown heading `## Using \`npm install\`` + * + * 1. marked renders: "Using npm install" + * 2. We store this as `text` (used in TOC link display) + * 3. We strip HTML + decode entities → `raw` (used for ID generation) + * 4. github-slugger creates ID from raw → "using-npm-install" + * + * The TOC then shows formatted links: + * + * Using npm install + * + * Without our fork, TOC links would lose all formatting: + * + * Using npm install ← formatting lost! 
+ * + * ============================================================================= + * CHANGES FROM ORIGINAL + * ============================================================================= + * * - Converted to TypeScript * - Simplified API (removed globalSlugs option - we always reset per document) - * - Improved entity handling (decode before slugging, not after) - * - Added HeadingData export for TOC generation + * - Added `text` field: HTML as rendered by marked (preserves , , etc.) + * - Added `raw` field: Plain text with HTML stripped and entities decoded + * - Slug generation uses `raw` (decoded text, not HTML) */ import GithubSlugger from 'github-slugger'; import type { MarkedExtension, Tokens } from 'marked'; -import { decodeHtmlEntities, stripHtmlTags } from '../html.utils'; +import { decodeHtmlEntities, stripHtmlTags } from './html.utils'; export interface HeadingData { + /** Heading level (1-6) */ level: number; - /** The heading text (may contain HTML entities from marked) */ + /** Formatted text with HTML preserved: "Using npm" */ text: string; - /** The raw heading text (HTML tags stripped, entities decoded) */ + /** Plain text (HTML stripped, entities decoded): "Using npm" */ raw: string; - /** The generated slug ID */ + /** Anchor ID for linking: "using-npm" (generated by github-slugger) */ id: string; } diff --git a/shared/html.utils.spec.ts b/shared/html.utils.spec.ts new file mode 100644 index 0000000..6e01994 --- /dev/null +++ b/shared/html.utils.spec.ts @@ -0,0 +1,182 @@ +import { describe, it, expect } from 'vitest'; +import { stripHtmlTags, decodeHtmlEntities, escapeHtml } from './html.utils'; + +describe('stripHtmlTags', () => { + it('should return empty string for empty input', () => { + expect(stripHtmlTags('')).toBe(''); + }); + + it('should return text unchanged when no HTML tags present', () => { + expect(stripHtmlTags('Hello World')).toBe('Hello World'); + }); + + it('should strip simple HTML tags', () => { + expect(stripHtmlTags('

Hello

')).toBe('Hello'); + }); + + it('should strip tags with attributes', () => { + expect(stripHtmlTags('Link')).toBe('Link'); + }); + + it('should strip multiple tags', () => { + expect(stripHtmlTags('

Hello

World
')).toBe('HelloWorld'); + }); + + it('should strip self-closing tags', () => { + expect(stripHtmlTags('Before
After')).toBe('BeforeAfter'); + expect(stripHtmlTags('Before
After')).toBe('BeforeAfter'); + }); + + it('should strip img tags', () => { + expect(stripHtmlTags('Test')).toBe(''); + }); + + it('should preserve text between tags', () => { + expect(stripHtmlTags('Bold and italic')).toBe('Bold and italic'); + }); + + it('should handle nested tags', () => { + expect(stripHtmlTags('

Deep

')).toBe('Deep'); + }); + + it('should handle tags with multiple attributes', () => { + expect(stripHtmlTags('')).toBe(''); + }); + + it('should preserve whitespace between tags', () => { + expect(stripHtmlTags('

Hello

World

')).toBe('Hello World'); + }); + + it('should handle HTML comments by stripping them', () => { + expect(stripHtmlTags('BeforeAfter')).toBe('BeforeAfter'); + }); +}); + +describe('decodeHtmlEntities', () => { + it('should return empty string for empty input', () => { + expect(decodeHtmlEntities('')).toBe(''); + }); + + it('should return text unchanged when no entities present', () => { + expect(decodeHtmlEntities('Hello World')).toBe('Hello World'); + }); + + it('should decode & to &', () => { + expect(decodeHtmlEntities('Tom & Jerry')).toBe('Tom & Jerry'); + }); + + it('should decode < to <', () => { + expect(decodeHtmlEntities('a < b')).toBe('a < b'); + }); + + it('should decode > to >', () => { + expect(decodeHtmlEntities('a > b')).toBe('a > b'); + }); + + it('should decode " to "', () => { + expect(decodeHtmlEntities('He said "hello"')).toBe('He said "hello"'); + }); + + it('should decode ' to single quote', () => { + expect(decodeHtmlEntities("It's fine")).toBe("It's fine"); + }); + + it('should decode ' to single quote', () => { + expect(decodeHtmlEntities("It's fine")).toBe("It's fine"); + }); + + it('should decode / to /', () => { + expect(decodeHtmlEntities('path/to/file')).toBe('path/to/file'); + }); + + it('should decode multiple entities in one string', () => { + expect(decodeHtmlEntities('<div class="test">')).toBe('
'); + }); + + it('should handle entities at start and end', () => { + expect(decodeHtmlEntities('&start and end&')).toBe('&start and end&'); + }); + + it('should handle multiple consecutive same entities', () => { + expect(decodeHtmlEntities('&&&')).toBe('&&&'); + }); + + it('should decode Array pattern (common in code)', () => { + expect(decodeHtmlEntities('Array<string>')).toBe('Array'); + }); + + it('should decode generic TypeScript code pattern', () => { + expect(decodeHtmlEntities('Map<string, number>')).toBe('Map'); + }); +}); + +describe('escapeHtml', () => { + it('should return empty string for empty input', () => { + expect(escapeHtml('')).toBe(''); + }); + + it('should return text unchanged when no special chars present', () => { + expect(escapeHtml('Hello World')).toBe('Hello World'); + }); + + it('should escape & to &', () => { + expect(escapeHtml('Tom & Jerry')).toBe('Tom & Jerry'); + }); + + it('should escape " to "', () => { + expect(escapeHtml('He said "hello"')).toBe('He said "hello"'); + }); + + it('should escape < to <', () => { + expect(escapeHtml('a < b')).toBe('a < b'); + }); + + it('should escape > to >', () => { + expect(escapeHtml('a > b')).toBe('a > b'); + }); + + it('should escape all special characters in one string', () => { + expect(escapeHtml('
&
')).toBe('<div class="test">&</div>'); + }); + + it('should handle multiple ampersands correctly', () => { + // Ampersands must be escaped first to avoid double-escaping + expect(escapeHtml('a & b & c')).toBe('a & b & c'); + }); + + it('should escape HTML tag patterns', () => { + expect(escapeHtml('')).toBe('<script>alert("xss")</script>'); + }); + + it('should escape TypeScript generic syntax', () => { + expect(escapeHtml('Array')).toBe('Array<string>'); + }); + + it('should be reversible with decodeHtmlEntities', () => { + const original = 'Tom & Jerry <3 "quotes"'; + const escaped = escapeHtml(original); + const decoded = decodeHtmlEntities(escaped); + expect(decoded).toBe(original); + }); +}); + +describe('escapeHtml and decodeHtmlEntities roundtrip', () => { + const testCases = [ + 'Simple text', + 'Tom & Jerry', + 'a < b > c', + 'He said "hello"', + "It's fine", + '
Content & more
', + 'Array>', + '& already encoded', + ]; + + testCases.forEach((input) => { + it(`should roundtrip: ${input.substring(0, 30)}...`, () => { + const escaped = escapeHtml(input); + const decoded = decodeHtmlEntities(escaped); + expect(decoded).toBe(input); + }); + }); +}); diff --git a/shared/jekyll-markdown-parser.spec.ts b/shared/jekyll-markdown-parser.spec.ts index 9fb9ab1..a178e90 100644 --- a/shared/jekyll-markdown-parser.spec.ts +++ b/shared/jekyll-markdown-parser.spec.ts @@ -1491,6 +1491,139 @@ Just text, no more headings. // TOC area should be essentially empty (just the Inhalt heading) expect(result.html).toContain('

Inhalt

'); }); + + it('should preserve inline code formatting in TOC links', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## Using \`npm install\` + +Text. + +## The \`async\` Keyword + +More text. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // TOC links should contain tags (rendered from markdown) + expect(result.html).toContain('npm install'); + expect(result.html).toContain('async'); + // The actual headings should also have code formatting + expect(result.html).toContain('

Using npm install

'); + }); + + it('should preserve bold and italic formatting in TOC links', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## This is **important** + +Text. + +## Use *caution* here + +More text. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // TOC links should contain formatting tags + expect(result.html).toContain('important'); + expect(result.html).toContain('caution here'); + }); + + it('should preserve mixed formatting in TOC links', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## Using \`rxResource\` with **Signals** + +Complex example. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // Should have both code and bold formatting + expect(result.html).toContain('rxResource'); + expect(result.html).toContain('Signals'); + // Verify the complete link structure + expect(result.html).toContain('Using rxResource with Signals'); + }); + + it('should handle headings with only code (no plain text)', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## \`package.json\` + +Config file. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // The entire heading is code + expect(result.html).toContain('package.json'); + expect(result.html).toContain('id="packagejson"'); + }); + + it('should warn about duplicate headings (known limitation)', () => { + // KNOWN LIMITATION: If the same heading text appears multiple times, + // TOC links may not work correctly due to ID suffix mismatch. + // We warn about this but don't fix it (very rare edge case). + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## Fazit + +Text. + +## Fazit + +End. 
+`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + + // Capture console.warn + const warnings: string[] = []; + const originalWarn = console.warn; + console.warn = (msg: string) => warnings.push(msg); + + parser.parse(input); + + console.warn = originalWarn; + + // Should warn about duplicate heading + expect(warnings.length).toBe(1); + expect(warnings[0]).toContain('Duplicate heading'); + expect(warnings[0]).toContain('Fazit'); + }); }); }); }); diff --git a/shared/jekyll-markdown-parser.ts b/shared/jekyll-markdown-parser.ts index 93b2529..1602405 100644 --- a/shared/jekyll-markdown-parser.ts +++ b/shared/jekyll-markdown-parser.ts @@ -195,11 +195,20 @@ export class JekyllMarkdownParser { return ''; } + // Warn about duplicate headings (would cause ID mismatch if also before marker) + const seenRaw = new Set(); + for (const h of relevantHeadings) { + if (seenRaw.has(h.raw)) { + console.warn(`WARNING: Duplicate heading "${h.raw}" - TOC links may not work correctly`); + } + seenRaw.add(h.raw); + } + // Generate markdown list return relevantHeadings .map(h => { const indent = h.level === 3 ? ' ' : ''; - return `${indent}* [${h.raw}](#${h.id})`; + return `${indent}* [${h.text}](#${h.id})`; }) .join('\n'); } From 3e88eeb9bece9d4d0a11eb32de653f64ad831b9d Mon Sep 17 00:00:00 2001 From: Johannes Hoppe Date: Sat, 7 Feb 2026 20:58:39 +0100 Subject: [PATCH 11/16] feat: add anchor link validation with fuzzy "did you mean?" 
suggestions Build-time validation for broken anchor links: - Collects all heading IDs during parsing - Extracts all anchor links from HTML - Validates links point to existing anchors - Suggests similar anchors using Levenshtein distance (typo detection) - Warnings only, does not fail the build New files: - string.utils.ts: Levenshtein distance + findSimilar (27 tests) - link-validator.ts: Anchor validation logic (14 tests) Changes: - jekyll-markdown-parser.ts: Now returns headingIds - base.utils.ts: Registers anchors and links during parsing - build.ts: Runs validation at end of build Total: 246 tests passing --- build.ts | 7 +- shared/base.utils.ts | 11 +- shared/jekyll-markdown-parser.ts | 11 +- shared/link-validator.spec.ts | 174 ++++++++++++++++++++++++++++ shared/link-validator.ts | 162 ++++++++++++++++++++++++++ shared/string.utils.spec.ts | 193 +++++++++++++++++++++++++++++++ shared/string.utils.ts | 113 ++++++++++++++++++ 7 files changed, 663 insertions(+), 8 deletions(-) create mode 100644 shared/link-validator.spec.ts create mode 100644 shared/link-validator.ts create mode 100644 shared/string.utils.spec.ts create mode 100644 shared/string.utils.ts diff --git a/build.ts b/build.ts index 53b257a..a3fe2c8 100644 --- a/build.ts +++ b/build.ts @@ -8,6 +8,7 @@ import { copyEntriesToDist, getEntryList } from './shared/base.utils'; import { makeLightBlogList } from './blog/blog.utils'; import { makeLightList } from './shared/list.utils'; import { MARKDOWN_BASE_URL_PLACEHOLDER } from './shared/jekyll-markdown-parser'; +import { printValidationResults } from './shared/link-validator'; const DIST_FOLDER = '../dist'; const BLOG_FOLDER = '../blog'; @@ -65,7 +66,11 @@ async function build(): Promise { await buildBlog(); await buildMaterial(); - console.log('Build complete!'); + // Validate all anchor links (warnings only, does not fail build) + console.log('\nValidating anchor links...'); + printValidationResults(); + + console.log('\nBuild complete!'); } 
build().catch((error) => { diff --git a/shared/base.utils.ts b/shared/base.utils.ts index 5ed1017..23e3b63 100644 --- a/shared/base.utils.ts +++ b/shared/base.utils.ts @@ -6,6 +6,7 @@ import { copy, remove, writeJson, mkdirp } from 'fs-extra'; import { JekyllMarkdownParser } from './jekyll-markdown-parser'; import { EntryBase, ImageDimensions } from './base.types'; +import { registerAnchors, registerLinks } from './link-validator'; const README_FILE = 'README.md'; const ENTRY_FILE = 'entry.json'; @@ -81,9 +82,13 @@ export async function markdownToEntry( ): Promise { const imageBaseUrl = baseUrl + folder + '/'; const parser = new JekyllMarkdownParser(imageBaseUrl, linkBasePath); - const parsedJekyllMarkdown = parser.parse(markdown); + const { html, parsedYaml, headingIds } = parser.parse(markdown); - const meta: Record = parsedJekyllMarkdown.parsedYaml; + // Register anchors and links for validation + registerAnchors(linkBasePath, headingIds); + registerLinks(linkBasePath, html); + + const meta: Record = parsedYaml; // Convert Date objects from js-yaml to ISO strings // js-yaml parses unquoted dates (e.g., `published: 2024-01-15`) as Date objects @@ -105,7 +110,7 @@ export async function markdownToEntry( // Type assertion: we trust that YAML contains all required properties for T return { slug: folder, - html: emoji.emojify(parsedJekyllMarkdown.html), + html: emoji.emojify(html), meta } as unknown as T; } diff --git a/shared/jekyll-markdown-parser.ts b/shared/jekyll-markdown-parser.ts index 1602405..f9cd536 100644 --- a/shared/jekyll-markdown-parser.ts +++ b/shared/jekyll-markdown-parser.ts @@ -300,7 +300,7 @@ export class JekyllMarkdownParser { return { markdown, yaml }; } - private compileMarkdown(markdown: string): string { + private compileMarkdown(markdown: string): { html: string; headingIds: string[] } { // Generate TOC if marker is present // Note: This parses twice when TOC is present - once to collect headings, once for final HTML. 
// This is intentional: we need heading data before generating TOC, but TOC must be in the @@ -314,8 +314,10 @@ export class JekyllMarkdownParser { // Reset headings for clean state (generateToc may have populated them) resetHeadings(); const html = this.marked.parse(processedMarkdown) as string; + const headingIds = getHeadingList().map(h => h.id); const withImages = this.transformRelativeImagePaths(html); - return this.transformRelativeLinks(withImages); + const finalHtml = this.transformRelativeLinks(withImages); + return { html: finalHtml, headingIds }; } private parseYaml(yaml: string): Record { @@ -329,11 +331,12 @@ export class JekyllMarkdownParser { public parse(jekyllMarkdown: string): { html: string; parsedYaml: Record; + headingIds: string[]; } { const { yaml, markdown } = this.separate(jekyllMarkdown); const parsedYaml = this.parseYaml(yaml); - const html = this.compileMarkdown(markdown); + const { html, headingIds } = this.compileMarkdown(markdown); - return { html, parsedYaml }; + return { html, parsedYaml, headingIds }; } } diff --git a/shared/link-validator.spec.ts b/shared/link-validator.spec.ts new file mode 100644 index 0000000..a0102d9 --- /dev/null +++ b/shared/link-validator.spec.ts @@ -0,0 +1,174 @@ +import { describe, it, expect, beforeEach } from 'vitest'; +import { + registerAnchors, + registerLinks, + validateLinks, + resetValidator, + getAnchors, + getLinks +} from './link-validator'; + +describe('link-validator', () => { + beforeEach(() => { + resetValidator(); + }); + + describe('registerAnchors', () => { + it('should register anchors for a path', () => { + registerAnchors('/blog/my-post', ['intro', 'fazit']); + + const anchors = getAnchors('/blog/my-post'); + expect(anchors).toBeDefined(); + expect(anchors!.has('intro')).toBe(true); + expect(anchors!.has('fazit')).toBe(true); + }); + + it('should accumulate anchors for same path', () => { + registerAnchors('/blog/my-post', ['intro']); + registerAnchors('/blog/my-post', ['fazit']); + + 
const anchors = getAnchors('/blog/my-post'); + expect(anchors!.size).toBe(2); + }); + + it('should keep anchors separate per path', () => { + registerAnchors('/blog/post-1', ['intro']); + registerAnchors('/blog/post-2', ['fazit']); + + expect(getAnchors('/blog/post-1')!.has('intro')).toBe(true); + expect(getAnchors('/blog/post-1')!.has('fazit')).toBe(false); + expect(getAnchors('/blog/post-2')!.has('fazit')).toBe(true); + }); + }); + + describe('registerLinks', () => { + it('should extract anchor links from HTML', () => { + const html = 'Link'; + registerLinks('/blog/my-post', html); + + const links = getLinks(); + expect(links).toHaveLength(1); + expect(links[0]).toEqual({ + fromPath: '/blog/my-post', + toPath: '/blog/other', + anchor: 'section', + fullLink: '/blog/other#section' + }); + }); + + it('should handle same-document anchors', () => { + const html = 'Link'; + registerLinks('/blog/my-post', html); + + const links = getLinks(); + expect(links[0].toPath).toBe('/blog/my-post'); + expect(links[0].anchor).toBe('local-section'); + }); + + it('should extract multiple links', () => { + const html = ` + One + Two + Three + `; + registerLinks('/blog/my-post', html); + + expect(getLinks()).toHaveLength(3); + }); + + it('should ignore links without anchors', () => { + const html = 'No anchor'; + registerLinks('/blog/my-post', html); + + expect(getLinks()).toHaveLength(0); + }); + + it('should handle both quote styles', () => { + const html = ` + Double + Single + `; + registerLinks('/blog/my-post', html); + + expect(getLinks()).toHaveLength(2); + }); + }); + + describe('validateLinks', () => { + it('should return valid for matching links', () => { + registerAnchors('/blog/post-1', ['intro', 'fazit']); + registerAnchors('/blog/post-2', ['overview']); + + const html = ` + Intro + Overview + `; + registerLinks('/blog/post-1', html); + + const result = validateLinks(); + expect(result.valid).toBe(true); + expect(result.brokenLinks).toHaveLength(0); + }); + + it('should 
detect broken anchor in existing path', () => { + registerAnchors('/blog/post-1', ['intro']); + + const html = 'Broken'; + registerLinks('/blog/my-post', html); + + const result = validateLinks(); + expect(result.valid).toBe(false); + expect(result.brokenLinks).toHaveLength(1); + expect(result.brokenLinks[0].anchor).toBe('nonexistent'); + }); + + it('should detect link to nonexistent path', () => { + registerAnchors('/blog/post-1', ['intro']); + + const html = 'Broken'; + registerLinks('/blog/my-post', html); + + const result = validateLinks(); + expect(result.valid).toBe(false); + expect(result.brokenLinks).toHaveLength(1); + expect(result.brokenLinks[0].toPath).toBe('/blog/nonexistent'); + }); + + it('should validate same-document links', () => { + registerAnchors('/blog/my-post', ['existing']); + + const html = ` + Valid + Broken + `; + registerLinks('/blog/my-post', html); + + const result = validateLinks(); + expect(result.valid).toBe(false); + expect(result.brokenLinks).toHaveLength(1); + expect(result.brokenLinks[0].anchor).toBe('missing'); + }); + + it('should count total links', () => { + registerAnchors('/blog/post', ['a', 'b']); + + const html = 'ABC'; + registerLinks('/blog/post', html); + + const result = validateLinks(); + expect(result.totalLinks).toBe(3); + }); + }); + + describe('resetValidator', () => { + it('should clear all data', () => { + registerAnchors('/blog/post', ['intro']); + registerLinks('/blog/post', 'Link'); + + resetValidator(); + + expect(getAnchors('/blog/post')).toBeUndefined(); + expect(getLinks()).toHaveLength(0); + }); + }); +}); diff --git a/shared/link-validator.ts b/shared/link-validator.ts new file mode 100644 index 0000000..2197426 --- /dev/null +++ b/shared/link-validator.ts @@ -0,0 +1,162 @@ +/** + * Anchor Link Validator + * + * Validates that all internal anchor links point to existing headings. + * Runs after all entries are parsed to catch broken links at build time. 
+ * + * Features: + * - Detects broken anchor links (missing target or missing anchor) + * - Suggests similar anchors using Levenshtein distance (typo detection) + * - Non-blocking: only warns, does not fail the build + * + * Usage: + * 1. registerAnchors(path, headingIds) - call after parsing each entry + * 2. registerLinks(path, html) - extracts and registers all anchor links + * 3. validateLinks() - call at end of build to check for broken links + */ + +import { findSimilar } from './string.utils'; + +/** Registry of all anchors per entry path */ +const anchorRegistry = new Map>(); + +/** Registry of all anchor links found: { fromPath, toPath, anchor } */ +interface AnchorLink { + fromPath: string; + toPath: string; + anchor: string; + fullLink: string; +} +const linkRegistry: AnchorLink[] = []; + +// Regex to find href attributes with anchors: href="/blog/slug#anchor" or href="#anchor" +const ANCHOR_LINK_REGEX = /]*\shref=(["'])([^"']*#[^"']+)\1/g; + +/** + * Register heading anchors for an entry. + * @param entryPath - Absolute path like "/blog/my-post" + * @param headingIds - Array of heading IDs like ["intro", "fazit"] + */ +export function registerAnchors(entryPath: string, headingIds: string[]): void { + const existing = anchorRegistry.get(entryPath) ?? new Set(); + for (const id of headingIds) { + existing.add(id); + } + anchorRegistry.set(entryPath, existing); +} + +/** + * Extract anchor links from HTML and register them. 
+ * @param fromPath - Entry path where links were found + * @param html - HTML content to scan for links + */ +export function registerLinks(fromPath: string, html: string): void { + let match; + while ((match = ANCHOR_LINK_REGEX.exec(html)) !== null) { + const fullLink = match[2]; + + // Parse the link: "/blog/other#section" or "#section" + const hashIndex = fullLink.indexOf('#'); + if (hashIndex === -1) continue; + + const pathPart = fullLink.substring(0, hashIndex); + const anchor = fullLink.substring(hashIndex + 1); + + // Determine target path + const toPath = pathPart || fromPath; // Empty path = same document + + linkRegistry.push({ + fromPath, + toPath, + anchor, + fullLink + }); + } +} + +/** + * Validate all registered links against registered anchors. + * @returns Object with broken links and stats + */ +export function validateLinks(): { + valid: boolean; + totalLinks: number; + brokenLinks: AnchorLink[]; +} { + const brokenLinks: AnchorLink[] = []; + + for (const link of linkRegistry) { + const targetAnchors = anchorRegistry.get(link.toPath); + + if (!targetAnchors) { + // Target entry doesn't exist + brokenLinks.push(link); + } else if (!targetAnchors.has(link.anchor)) { + // Anchor doesn't exist in target entry + brokenLinks.push(link); + } + } + + return { + valid: brokenLinks.length === 0, + totalLinks: linkRegistry.length, + brokenLinks + }; +} + +/** + * Print validation results to console. 
+ * @returns true if all links are valid, false if there are broken links + */ +export function printValidationResults(): boolean { + const { valid, totalLinks, brokenLinks } = validateLinks(); + + if (valid) { + console.log(`✓ All ${totalLinks} anchor links are valid`); + return true; + } + + console.warn(`\n⚠️ Found ${brokenLinks.length} broken anchor link(s):\n`); + for (const link of brokenLinks) { + console.warn(` ${link.fromPath}`); + console.warn(` → ${link.fullLink}`); + + // Provide helpful context + const targetAnchors = anchorRegistry.get(link.toPath); + if (!targetAnchors) { + console.warn(` ✗ Target path "${link.toPath}" does not exist`); + } else { + console.warn(` ✗ Anchor "#${link.anchor}" not found`); + // Suggest similar anchors using fuzzy matching (Levenshtein distance ≤ 3) + const similar = findSimilar(link.anchor, [...targetAnchors], 3); + if (similar.length > 0) { + console.warn(` ? Did you mean: ${similar.slice(0, 3).map(a => '#' + a).join(', ')}`); + } + } + console.warn(''); + } + + return false; +} + +/** + * Reset the validator (for testing). + */ +export function resetValidator(): void { + anchorRegistry.clear(); + linkRegistry.length = 0; +} + +/** + * Get registered anchors for a path (for testing). + */ +export function getAnchors(entryPath: string): Set | undefined { + return anchorRegistry.get(entryPath); +} + +/** + * Get all registered links (for testing). 
+ */ +export function getLinks(): AnchorLink[] { + return [...linkRegistry]; +} diff --git a/shared/string.utils.spec.ts b/shared/string.utils.spec.ts new file mode 100644 index 0000000..c95a451 --- /dev/null +++ b/shared/string.utils.spec.ts @@ -0,0 +1,193 @@ +import { describe, it, expect } from 'vitest'; +import { levenshtein, findSimilar } from './string.utils'; + +describe('levenshtein', () => { + describe('identical strings', () => { + it('should return 0 for empty strings', () => { + expect(levenshtein('', '')).toBe(0); + }); + + it('should return 0 for identical strings', () => { + expect(levenshtein('hello', 'hello')).toBe(0); + expect(levenshtein('introduction', 'introduction')).toBe(0); + }); + }); + + describe('empty string cases', () => { + it('should return length of other string when one is empty', () => { + expect(levenshtein('', 'abc')).toBe(3); + expect(levenshtein('hello', '')).toBe(5); + }); + }); + + describe('single character edits', () => { + it('should detect single insertion', () => { + expect(levenshtein('ac', 'abc')).toBe(1); + expect(levenshtein('hell', 'hello')).toBe(1); + }); + + it('should detect single deletion', () => { + expect(levenshtein('abc', 'ac')).toBe(1); + expect(levenshtein('hello', 'helo')).toBe(1); + }); + + it('should detect single substitution', () => { + expect(levenshtein('abc', 'adc')).toBe(1); + expect(levenshtein('cat', 'bat')).toBe(1); + }); + }); + + describe('multiple edits', () => { + it('should count transposition as 2 edits', () => { + // Levenshtein counts transposition as 2 edits (delete + insert) + expect(levenshtein('ab', 'ba')).toBe(2); + expect(levenshtein('intro', 'intor')).toBe(2); + }); + + it('should handle classic example: kitten → sitting', () => { + // kitten → sitten (substitute k→s) + // sitten → sittin (substitute e→i) + // sittin → sitting (insert g) + expect(levenshtein('kitten', 'sitting')).toBe(3); + }); + + it('should handle complete replacement', () => { + expect(levenshtein('abc', 
'xyz')).toBe(3); + }); + }); + + describe('real-world anchor examples', () => { + it('should detect typo: fazti → fazit', () => { + expect(levenshtein('fazti', 'fazit')).toBe(2); + }); + + it('should detect missing letter: instalation → installation', () => { + expect(levenshtein('instalation', 'installation')).toBe(1); + }); + + it('should detect extra letter: intrroduction → introduction', () => { + expect(levenshtein('intrroduction', 'introduction')).toBe(1); + }); + + it('should detect wrong letter: getting-startet → getting-started', () => { + expect(levenshtein('getting-startet', 'getting-started')).toBe(1); + }); + + it('should handle German umlauts', () => { + expect(levenshtein('über-uns', 'uber-uns')).toBe(1); + expect(levenshtein('übersicht', 'übersicht')).toBe(0); + }); + }); + + describe('symmetry', () => { + it('should be symmetric: d(a,b) = d(b,a)', () => { + expect(levenshtein('abc', 'def')).toBe(levenshtein('def', 'abc')); + expect(levenshtein('hello', 'hallo')).toBe(levenshtein('hallo', 'hello')); + expect(levenshtein('short', 'muchlonger')).toBe(levenshtein('muchlonger', 'short')); + }); + }); + + describe('triangle inequality', () => { + it('should satisfy: d(a,c) ≤ d(a,b) + d(b,c)', () => { + const a = 'abc'; + const b = 'abd'; + const c = 'acd'; + const dAB = levenshtein(a, b); + const dBC = levenshtein(b, c); + const dAC = levenshtein(a, c); + expect(dAC).toBeLessThanOrEqual(dAB + dBC); + }); + }); +}); + +describe('findSimilar', () => { + const candidates = [ + 'introduction', + 'getting-started', + 'installation', + 'configuration', + 'conclusion', + 'fazit', + 'über-uns' + ]; + + describe('typo detection', () => { + it('should find similar for typo: intrduction → introduction', () => { + const result = findSimilar('intrduction', candidates); + expect(result).toContain('introduction'); + }); + + it('should find similar for typo: instalation → installation', () => { + const result = findSimilar('instalation', candidates); + 
expect(result).toContain('installation'); + }); + + it('should find similar for typo: fazti → fazit', () => { + const result = findSimilar('fazti', candidates); + expect(result).toContain('fazit'); + }); + }); + + describe('sorting by distance', () => { + it('should return results sorted by distance (most similar first)', () => { + // 'intro' has distance 7 to 'introduction' and higher to others + const testCandidates = ['abc', 'ab', 'abcd', 'abcde']; + const result = findSimilar('abc', testCandidates, 5); + + // ab=1, abcd=1, abcde=2 (abc is exact match, excluded) + expect(result[0]).toBe('ab'); + // ab and abcd both have distance 1, order may vary + expect(result).toContain('abcd'); + }); + }); + + describe('maxDistance threshold', () => { + it('should respect maxDistance parameter', () => { + const result = findSimilar('xyz', candidates, 2); + // All candidates are far from 'xyz', none within distance 2 + expect(result).toHaveLength(0); + }); + + it('should include matches at exactly maxDistance', () => { + // 'fazit' → 'fazti' has distance 2 + const result = findSimilar('fazti', ['fazit'], 2); + expect(result).toContain('fazit'); + }); + + it('should exclude matches beyond maxDistance', () => { + const result = findSimilar('fazti', ['fazit'], 1); + expect(result).not.toContain('fazit'); + }); + }); + + describe('exact matches', () => { + it('should not include exact matches (not useful as suggestions)', () => { + const result = findSimilar('fazit', candidates); + expect(result).not.toContain('fazit'); + }); + }); + + describe('empty cases', () => { + it('should return empty array for empty candidates', () => { + const result = findSimilar('test', []); + expect(result).toHaveLength(0); + }); + + it('should return empty array when nothing is similar', () => { + const result = findSimilar('xyzabc123', candidates, 3); + expect(result).toHaveLength(0); + }); + }); + + describe('default maxDistance', () => { + it('should use default maxDistance of 3', () => { + // 
'intro' to 'fazit' is distance 5, should not match with default + const result = findSimilar('intro', ['fazit']); + expect(result).toHaveLength(0); + + // 'fazi' to 'fazit' is distance 1, should match + const result2 = findSimilar('fazi', ['fazit']); + expect(result2).toContain('fazit'); + }); + }); +}); diff --git a/shared/string.utils.ts b/shared/string.utils.ts new file mode 100644 index 0000000..814da1f --- /dev/null +++ b/shared/string.utils.ts @@ -0,0 +1,113 @@ +/** + * String utility functions. + */ + +/** + * Calculate the Levenshtein distance between two strings. + * + * The Levenshtein distance is the minimum number of single-character edits + * (insertions, deletions, or substitutions) required to transform one string + * into another. + * + * @example + * ```typescript + * levenshtein('kitten', 'sitting'); // 3 (k→s, e→i, +g) + * levenshtein('intro', 'intor'); // 2 (transposition = 2 edits) + * levenshtein('hello', 'hello'); // 0 (identical) + * levenshtein('', 'abc'); // 3 (3 insertions) + * ``` + * + * Time complexity: O(m × n) where m = a.length, n = b.length + * Space complexity: O(min(m, n)) using single-row optimization + * + * @param a - First string + * @param b - Second string + * @returns The edit distance (0 = identical, higher = more different) + */ +export function levenshtein(a: string, b: string): number { + // Ensure a is the shorter string for space optimization + if (a.length > b.length) { + [a, b] = [b, a]; + } + + const m = a.length; + const n = b.length; + + // Edge cases + if (m === 0) return n; + if (n === 0) return m; + + // Single-row DP: dp[i] = distance for a[0..i-1] vs b[0..j-1] + // Initialize with distances for empty b (all insertions) + const dp: number[] = Array.from({ length: m + 1 }, (_, i) => i); + + for (let j = 1; j <= n; j++) { + let prev = dp[0]; // dp[i-1][j-1] from previous iteration + dp[0] = j; // Distance for empty a vs b[0..j-1] + + for (let i = 1; i <= m; i++) { + const temp = dp[i]; + + if (a[i - 1] === b[j 
- 1]) { + // Characters match: no edit needed + dp[i] = prev; + } else { + // Minimum of: substitute, delete, insert + dp[i] = 1 + Math.min( + prev, // substitute a[i-1] with b[j-1] + dp[i], // delete a[i-1] + dp[i - 1] // insert b[j-1] + ); + } + + prev = temp; + } + } + + return dp[m]; +} + +/** + * Find strings similar to a query using Levenshtein distance. + * + * Returns candidates sorted by similarity (most similar first). + * Only includes candidates within the maximum distance threshold. + * + * @example + * ```typescript + * const headings = ['introduction', 'getting-started', 'conclusion']; + * findSimilar('intrduction', headings, 3); + * // Returns: ['introduction'] (distance 1) + * + * findSimilar('start', headings, 10); + * // Returns: ['getting-started'] (distance 9, but "start" is substring) + * ``` + * + * @param query - The string to find matches for + * @param candidates - Array of strings to search in + * @param maxDistance - Maximum edit distance to consider (default: 3) + * @returns Array of similar strings, sorted by distance (ascending) + */ +export function findSimilar( + query: string, + candidates: string[], + maxDistance: number = 3 +): string[] { + const matches: Array<{ candidate: string; distance: number }> = []; + + for (const candidate of candidates) { + // Skip exact matches (not useful as suggestions) + if (candidate === query) continue; + + const distance = levenshtein(query, candidate); + + if (distance <= maxDistance) { + matches.push({ candidate, distance }); + } + } + + // Sort by distance (most similar first) + matches.sort((a, b) => a.distance - b.distance); + + return matches.map(m => m.candidate); +} From d7d656e826fa9f386e771160b6b29e209147b9b5 Mon Sep 17 00:00:00 2001 From: Ferdinand Malcher Date: Sun, 8 Feb 2026 10:53:09 +0100 Subject: [PATCH 12/16] fix(README): indentation --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 299479e..56f9150 100644 --- 
a/README.md +++ b/README.md @@ -10,9 +10,9 @@ Used as a Git submodule in: ```bash npm install -npm run build # Single build -npm run watch # Watch mode for development -npm test # Run tests +npm run build # Single build +npm run watch # Watch mode for development +npm test # Run tests npm run typecheck # TypeScript check ``` From 289997aef71ca86fee264de0616ab09b6041e96c Mon Sep 17 00:00:00 2001 From: Johannes Hoppe Date: Sun, 8 Feb 2026 11:11:22 +0100 Subject: [PATCH 13/16] refactor: improve code quality based on audit findings - link-validator: use matchAll() instead of exec() loop with global regex - html.utils: add single quote escaping for complete HTML safety - base.utils: replace string concat sort key with proper multi-field comparison --- shared/base.utils.ts | 20 +++++++++++++++----- shared/html.utils.spec.ts | 4 ++++ shared/html.utils.ts | 2 ++ shared/link-validator.ts | 15 +++++++++++---- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/shared/base.utils.ts b/shared/base.utils.ts index 23e3b63..6646674 100644 --- a/shared/base.utils.ts +++ b/shared/base.utils.ts @@ -54,10 +54,20 @@ export async function copyEntriesToDist( } } -/** Simple way to sort things: create a sort key that can be easily sorted */ -function getSortKey(entry: EntryBase): string { - // ISO 8601 strings sort correctly in lexicographic order - return (entry.meta.sticky ? 'Z' : 'A') + '---' + entry.meta.published + '---' + entry.slug; +/** + * Compare two entries for sorting (newest first, sticky on top). + * @returns negative if a comes first, positive if b comes first + */ +function compareEntries(a: EntryBase, b: EntryBase): number { + // 1. Sticky entries first + if (a.meta.sticky !== b.meta.sticky) { + return a.meta.sticky ? -1 : 1; + } + // 2. Then by date (newest first) - ISO 8601 strings sort lexicographically + const dateCompare = b.meta.published.localeCompare(a.meta.published); + if (dateCompare !== 0) return dateCompare; + // 3. 
Slug as tiebreaker (descending) + return b.slug.localeCompare(a.slug); } @@ -131,5 +141,5 @@ export async function getEntryList(entriesFolder: string, m entries.push(entry); } - return entries.sort((a, b) => getSortKey(b).localeCompare(getSortKey(a))); + return entries.sort(compareEntries); } diff --git a/shared/html.utils.spec.ts b/shared/html.utils.spec.ts index 6e01994..fc590cb 100644 --- a/shared/html.utils.spec.ts +++ b/shared/html.utils.spec.ts @@ -135,6 +135,10 @@ describe('escapeHtml', () => { expect(escapeHtml('a > b')).toBe('a > b'); }); + it("should escape ' to '", () => { + expect(escapeHtml("It's fine")).toBe("It's fine"); + }); + it('should escape all special characters in one string', () => { expect(escapeHtml('
&
')).toBe('<div class="test">&</div>'); }); diff --git a/shared/html.utils.ts b/shared/html.utils.ts index 213aeef..8aeb971 100644 --- a/shared/html.utils.ts +++ b/shared/html.utils.ts @@ -25,11 +25,13 @@ export function decodeHtmlEntities(html: string): string { /** * Escape special HTML characters for use in attribute values. + * Escapes: & " ' < > */ export function escapeHtml(text: string): string { return text .replace(/&/g, '&') .replace(/"/g, '"') + .replace(/'/g, ''') .replace(//g, '>'); } diff --git a/shared/link-validator.ts b/shared/link-validator.ts index 2197426..31c3bee 100644 --- a/shared/link-validator.ts +++ b/shared/link-validator.ts @@ -32,6 +32,16 @@ const linkRegistry: AnchorLink[] = []; // Regex to find href attributes with anchors: href="/blog/slug#anchor" or href="#anchor" const ANCHOR_LINK_REGEX = /]*\shref=(["'])([^"']*#[^"']+)\1/g; +/** + * Extract all anchor links from HTML using matchAll(). + * Safer than exec() loop with global regex - no shared state issues. + */ +function extractAnchorLinks(html: string): Array<{ fullLink: string }> { + return [...html.matchAll(ANCHOR_LINK_REGEX)].map(match => ({ + fullLink: match[2] + })); +} + /** * Register heading anchors for an entry. 
* @param entryPath - Absolute path like "/blog/my-post" @@ -51,10 +61,7 @@ export function registerAnchors(entryPath: string, headingIds: string[]): void { * @param html - HTML content to scan for links */ export function registerLinks(fromPath: string, html: string): void { - let match; - while ((match = ANCHOR_LINK_REGEX.exec(html)) !== null) { - const fullLink = match[2]; - + for (const { fullLink } of extractAnchorLinks(html)) { // Parse the link: "/blog/other#section" or "#section" const hashIndex = fullLink.indexOf('#'); if (hashIndex === -1) continue; From ff39ba741e197491b146b25858c1fbfce1c08b34 Mon Sep 17 00:00:00 2001 From: Johannes Hoppe Date: Sun, 8 Feb 2026 12:25:34 +0100 Subject: [PATCH 14/16] fix: skip external URLs in anchor link validation --- shared/link-validator.spec.ts | 28 ++++++++++++++++++++++++++++ shared/link-validator.ts | 14 ++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/shared/link-validator.spec.ts b/shared/link-validator.spec.ts index a0102d9..a04f13f 100644 --- a/shared/link-validator.spec.ts +++ b/shared/link-validator.spec.ts @@ -92,6 +92,34 @@ describe('link-validator', () => { expect(getLinks()).toHaveLength(2); }); + + it('should skip external https links', () => { + const html = 'External'; + registerLinks('/blog/my-post', html); + + expect(getLinks()).toHaveLength(0); + }); + + it('should skip external http links', () => { + const html = 'External'; + registerLinks('/blog/my-post', html); + + expect(getLinks()).toHaveLength(0); + }); + + it('should skip protocol-relative links', () => { + const html = 'External'; + registerLinks('/blog/my-post', html); + + expect(getLinks()).toHaveLength(0); + }); + + it('should skip mailto links', () => { + const html = 'Mail'; + registerLinks('/blog/my-post', html); + + expect(getLinks()).toHaveLength(0); + }); }); describe('validateLinks', () => { diff --git a/shared/link-validator.ts b/shared/link-validator.ts index 31c3bee..0285471 100644 --- a/shared/link-validator.ts +++ 
b/shared/link-validator.ts @@ -55,13 +55,27 @@ export function registerAnchors(entryPath: string, headingIds: string[]): void { anchorRegistry.set(entryPath, existing); } +/** + * Check if a URL is external (should not be validated). + */ +function isExternalUrl(url: string): boolean { + return /^https?:\/\//.test(url) || + url.startsWith('//') || + url.startsWith('mailto:') || + url.startsWith('tel:'); +} + /** * Extract anchor links from HTML and register them. + * Only registers internal links - external URLs are skipped. * @param fromPath - Entry path where links were found * @param html - HTML content to scan for links */ export function registerLinks(fromPath: string, html: string): void { for (const { fullLink } of extractAnchorLinks(html)) { + // Skip external URLs + if (isExternalUrl(fullLink)) continue; + // Parse the link: "/blog/other#section" or "#section" const hashIndex = fullLink.indexOf('#'); if (hashIndex === -1) continue; From 968aa530f4590ccdcd04069393a668bab09b0d29 Mon Sep 17 00:00:00 2001 From: Johannes Hoppe Date: Sun, 8 Feb 2026 13:20:06 +0100 Subject: [PATCH 15/16] fix: URL-decode anchors before validation --- shared/link-validator.spec.ts | 16 ++++++++++++++++ shared/link-validator.ts | 3 ++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/shared/link-validator.spec.ts b/shared/link-validator.spec.ts index a04f13f..4664d81 100644 --- a/shared/link-validator.spec.ts +++ b/shared/link-validator.spec.ts @@ -120,6 +120,22 @@ describe('link-validator', () => { expect(getLinks()).toHaveLength(0); }); + + it('should URL-decode anchor (ü encoded as %C3%BC)', () => { + // %C3%BC = ü in UTF-8 + const html = 'Link'; + registerLinks('/blog/my-post', html); + + // Anchor should be decoded to Unicode + expect(getLinks()[0].anchor).toBe('über-uns'); + }); + + it('should handle already decoded Unicode anchors', () => { + const html = 'Link'; + registerLinks('/blog/my-post', html); + +
expect(getLinks()[0].anchor).toBe('grundsätzliches-zu-docker'); + }); }); describe('validateLinks', () => { diff --git a/shared/link-validator.ts b/shared/link-validator.ts index 0285471..c0515e8 100644 --- a/shared/link-validator.ts +++ b/shared/link-validator.ts @@ -81,7 +81,8 @@ export function registerLinks(fromPath: string, html: string): void { if (hashIndex === -1) continue; const pathPart = fullLink.substring(0, hashIndex); - const anchor = fullLink.substring(hashIndex + 1); + // URL-decode anchor (marked encodes special chars like ä → %C3%A4) + const anchor = decodeURIComponent(fullLink.substring(hashIndex + 1)); // Determine target path const toPath = pathPart || fromPath; // Empty path = same document From 14ce73ecf9839a22996e1245fc19d21b88d183bf Mon Sep 17 00:00:00 2001 From: Johannes Hoppe Date: Mon, 9 Feb 2026 18:59:18 +0100 Subject: [PATCH 16/16] feat: include h4 headings in [[toc]] generation --- shared/jekyll-markdown-parser.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/shared/jekyll-markdown-parser.ts b/shared/jekyll-markdown-parser.ts index 2783420..4a29c00 100644 --- a/shared/jekyll-markdown-parser.ts +++ b/shared/jekyll-markdown-parser.ts @@ -188,8 +188,8 @@ export class JekyllMarkdownParser { this.marked.parse(contentAfterMarker); const headings = getHeadingList(); - // Filter to h2 and h3 only - const relevantHeadings = headings.filter(h => h.level >= 2 && h.level <= 3); + // Filter to h2, h3, and h4 + const relevantHeadings = headings.filter(h => h.level >= 2 && h.level <= 4); if (relevantHeadings.length === 0) { return ''; @@ -207,7 +207,7 @@ export class JekyllMarkdownParser { // Generate markdown list return relevantHeadings .map(h => { - const indent = h.level === 3 ? ' ' : ''; + const indent = ' '.repeat(h.level - 2); // h2='', h3=' ', h4=' ' return `${indent}* [${h.text}](#${h.id})`; }) .join('\n');