diff --git a/README.md b/README.md index 0a80752..56f9150 100644 --- a/README.md +++ b/README.md @@ -1,58 +1,286 @@ # website-articles-build -Shared build scripts for processing Markdown articles into JSON. +Build system for blog and material articles. Transforms Markdown to JSON for Angular websites. -Used as a git subtree in: +Used as a Git submodule in: - [angular-buch/website-articles](https://github.com/angular-buch/website-articles) - [angular-schule/website-articles](https://github.com/angular-schule/website-articles) -## Usage +## Setup ```bash npm install -npm run build +npm run build # Single build +npm run watch # Watch mode for development +npm test # Run tests +npm run typecheck # TypeScript check ``` -## Scripts - -| Script | Description | -|--------------|--------------------------------------| -| `build` | Build blog and material entries | -| `test` | Run tests | -| `test:watch` | Run tests in watch mode | -| `typecheck` | TypeScript type checking | -| `watch` | Watch mode for development | - -## Folder Structure +## Project Structure ``` -├── build.ts # Main entry point +website-articles-build/ +├── build.ts # Main build script ├── blog/ -│ ├── blog.types.ts # Blog-specific types -│ └── blog.utils.ts # Blog list utilities +│ ├── blog.types.ts # Blog-specific types +│ └── blog.utils.ts # Blog-specific utilities ├── material/ -│ └── material.types.ts # Material-specific types +│ └── material.types.ts # Material-specific types └── shared/ - ├── base.types.ts # Shared base types - ├── base.utils.ts # File/folder utilities - ├── list.utils.ts # List extraction utilities - └── jekyll-markdown-parser.ts # Markdown parser + ├── jekyll-markdown-parser.ts # Markdown parser + ├── base.utils.ts # Shared utilities + └── list.utils.ts # List utilities +``` + +## Output + +The build generates for each article: + +| Output | Description | +|--------|-------------| +| `dist/blog/{slug}/entry.json` | Full article with HTML | +| `dist/blog/list.json` | List of all 
articles (light version) | +| `dist/material/{slug}/entry.json` | Full material entry | +| `dist/material/list.json` | List of all material entries | + +--- + +## Features for Markdown Authors + +### 1. Images + +Relative image paths are automatically transformed: + +```markdown +![Screenshot](screenshot.png) +![Logo](./images/logo.png) +``` + +**Build output:** +```html + +``` + +The placeholder `%%MARKDOWN_BASE_URL%%` is replaced at runtime by the Angular app (CDN on prod, proxy in dev). + +**Not transformed:** +- Absolute URLs: `https://example.com/image.png` +- Protocol-relative URLs: `//cdn.example.com/image.png` +- Asset paths: `assets/img/icon.svg` +- Absolute paths: `/images/logo.png` +- Data URIs: `data:image/png;base64,...` + +### 2. Links + +Relative links are transformed to absolute paths. This is necessary because our Angular website uses ``. + +#### Anchor Links (TOC) + +```markdown +[Introduction](#introduction) +``` + +**Build output:** +```html +Introduction ``` -## URL Placeholder +#### Cross-Article Links -Generated URLs use `%%MARKDOWN_BASE_URL%%` as a placeholder: -- `%%MARKDOWN_BASE_URL%%/blog/2024-post/image.png` -- `%%MARKDOWN_BASE_URL%%/material/chapter-1/diagram.svg` +```markdown +[Other Article](../other-article) +[Other Article with Anchor](../other-article#setup) +``` + +**Build output:** +```html +Other Article +Other Article with Anchor +``` + +**Not transformed:** +- Absolute URLs: `https://angular.io/docs` +- Already absolute paths: `/blog/other-article` +- mailto: `mailto:team@example.com` +- tel: `tel:+49123456` +- ftp: `ftp://files.example.com/file.zip` + +### 3. Automatic Table of Contents (TOC) + +Place `[[toc]]` in your Markdown to generate an automatic table of contents. + +#### Example + +```markdown +--- +title: My Article +published: 2024-01-15 +--- + +## Contents + +[[toc]] + +## Introduction -The consuming website replaces this placeholder with the actual base URL at runtime. +Lorem ipsum... 
-## Input/Output +### Subchapter -**Input:** `../blog/` and `../material/` folders with Markdown READMEs +More text... -**Output:** `../dist/` folder (parent directory) with: -- `../dist/blog/list.json` - Light blog list for overview -- `../dist/blog/{slug}/entry.json` - Full blog entry -- `../dist/material/list.json` - Light material list -- `../dist/material/{slug}/entry.json` - Full material entry +## Conclusion + +End. +``` + +#### Generated Output + +```html +

Contents

+ +``` + +#### Rules + +| Rule | Description | +|------|-------------| +| **Only h2 and h3** | h1 and h4+ are ignored | +| **After the marker** | Headings before `[[toc]]` are skipped | +| **Automatic IDs** | Heading IDs follow [GitHub's algorithm](https://github.com/Flet/github-slugger) | +| **Special characters** | Umlauts preserved (`Über uns` → `#über-uns`), `&` removed | + +### 4. Syntax Highlighting + +Code blocks are automatically formatted with highlight.js: + +````markdown +```typescript +const greeting = 'Hello World'; +console.log(greeting); +``` +```` + +### 5. Raw HTML + +HTML in Markdown is passed through unchanged: + +```markdown +
+

Custom styled content

+
+ + +``` + +**Security note:** This is intentional. We trust our own repository. There is no user-generated content. + +### 6. Emojis + +Emoji shortcodes are converted to Unicode: + +```markdown +Hello :smile: World :rocket: +``` + +**Output:** Hello 😄 World 🚀 + +--- + +## YAML Frontmatter + +Every article requires YAML frontmatter: + +```yaml +--- +title: "Article Title" +author: John Doe +mail: john@example.com +published: 2024-01-15 +language: en +header: header.jpg +keywords: + - Angular + - TypeScript +# Optional: +lastModified: 2024-02-01 +hidden: false # Don't show article in list +sticky: false # Pin article to top +darkenHeader: false +author2: Co-Author +mail2: co@example.com +bio: Short author bio +--- +``` + +### Date Formats + +Both formats are supported: + +```yaml +published: 2024-01-15 # Converted to ISO string +published: "2024-01-15T10:00:00Z" # Stays as string +``` + +--- + +## Development + +### Tests + +```bash +npm test # Single run +npm run test:watch # Watch mode +``` + +131 tests cover: +- Markdown parsing and HTML generation +- Image and link transformation +- TOC generation +- Edge cases (mailto, tel, CRLF, etc.) + +### TypeScript + +```bash +npm run typecheck # Type check +``` + +### Architecture + +``` +Markdown (README.md) + ↓ +JekyllMarkdownParser + ├── YAML Frontmatter → parsedYaml + ├── Markdown → marked → HTML + ├── Image URLs → transformed with placeholder + ├── Links → transformed to absolute paths + └── TOC → generated from headings + ↓ +entry.json +``` + +--- + +## Submodule Warning + +This repository is included as a Git submodule in `website-articles`. + +**Always make changes here**, not in the `build/` folder of the parent repo! 
+ +```bash +# CORRECT: Work here +cd website-articles-build +git checkout -b feature/xyz + +# WRONG: Don't work in the submodule +cd website-articles/build # ❌ +``` diff --git a/build.ts b/build.ts index 53b257a..a3fe2c8 100644 --- a/build.ts +++ b/build.ts @@ -8,6 +8,7 @@ import { copyEntriesToDist, getEntryList } from './shared/base.utils'; import { makeLightBlogList } from './blog/blog.utils'; import { makeLightList } from './shared/list.utils'; import { MARKDOWN_BASE_URL_PLACEHOLDER } from './shared/jekyll-markdown-parser'; +import { printValidationResults } from './shared/link-validator'; const DIST_FOLDER = '../dist'; const BLOG_FOLDER = '../blog'; @@ -65,7 +66,11 @@ async function build(): Promise { await buildBlog(); await buildMaterial(); - console.log('Build complete!'); + // Validate all anchor links (warnings only, does not fail build) + console.log('\nValidating anchor links...'); + printValidationResults(); + + console.log('\nBuild complete!'); } build().catch((error) => { diff --git a/package-lock.json b/package-lock.json index 08cc5d5..4408364 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,20 +1,20 @@ { - "name": "website-articles", + "name": "website-articles-build", "version": "1.0.0", "lockfileVersion": 3, "requires": true, "packages": { "": { - "name": "website-articles", + "name": "website-articles-build", "version": "1.0.0", "license": "ISC", "dependencies": { "fs-extra": "^11.2.0", + "github-slugger": "^2.0.0", "highlight.js": "^11.10.0", "image-size": "^2.0.2", "js-yaml": "^4.1.0", "marked": "^17.0.1", - "marked-gfm-heading-id": "^4.1.3", "marked-highlight": "^2.2.3", "node-emoji": "^2.1.3" }, @@ -723,18 +723,6 @@ "node": ">= 20" } }, - "node_modules/marked-gfm-heading-id": { - "version": "4.1.3", - "resolved": "https://registry.npmjs.org/marked-gfm-heading-id/-/marked-gfm-heading-id-4.1.3.tgz", - "integrity": "sha512-aR0i63LmFbuxU/gAgrgz1Ir+8HK6zAIFXMlckeKHpV+qKbYaOP95L4Ux5Gi+sKmCZU5qnN2rdKpvpb7PnUBIWg==", - "license": "MIT", 
- "dependencies": { - "github-slugger": "^2.0.0" - }, - "peerDependencies": { - "marked": ">=13 <18" - } - }, "node_modules/marked-highlight": { "version": "2.2.3", "resolved": "https://registry.npmjs.org/marked-highlight/-/marked-highlight-2.2.3.tgz", diff --git a/package.json b/package.json index 0c19d94..c87617e 100644 --- a/package.json +++ b/package.json @@ -16,8 +16,8 @@ "highlight.js": "^11.10.0", "image-size": "^2.0.2", "js-yaml": "^4.1.0", + "github-slugger": "^2.0.0", "marked": "^17.0.1", - "marked-gfm-heading-id": "^4.1.3", "marked-highlight": "^2.2.3", "node-emoji": "^2.1.3" }, diff --git a/shared/base.types.ts b/shared/base.types.ts index e3e2f4e..4819076 100644 --- a/shared/base.types.ts +++ b/shared/base.types.ts @@ -19,6 +19,7 @@ export interface EntryMetaBase { } export interface EntryBase { + /** URL-friendly identifier derived from folder name: "2024-01-my-post" */ slug: string; html: string; meta: EntryMetaBase; diff --git a/shared/base.utils.spec.ts b/shared/base.utils.spec.ts index be45393..0e57335 100644 --- a/shared/base.utils.spec.ts +++ b/shared/base.utils.spec.ts @@ -250,7 +250,8 @@ describe('base.utils', () => { markdown, 'test-entry', 'https://example.com/', - '/non/existent/path' + '/non/existent/path', + '/blog/test-entry' )).rejects.toThrow(); }); @@ -261,7 +262,8 @@ describe('base.utils', () => { markdown, 'test-entry', 'https://example.com/', - '/tmp' + '/tmp', + '/blog/test-entry' ); // node-emoji converts :smile: to 😄 and :rocket: to 🚀 @@ -278,7 +280,8 @@ describe('base.utils', () => { markdown, 'test-entry', 'https://example.com/', - '/tmp' + '/tmp', + '/blog/test-entry' ); // js-yaml parses unquoted dates as Date objects, but we convert to ISO string @@ -293,7 +296,8 @@ describe('base.utils', () => { markdown, 'my-awesome-post', 'https://example.com/', - '/tmp' + '/tmp', + '/blog/my-awesome-post' ); expect(result.slug).toBe('my-awesome-post'); diff --git a/shared/base.utils.ts b/shared/base.utils.ts index 3d35f1f..6646674 
100644 --- a/shared/base.utils.ts +++ b/shared/base.utils.ts @@ -6,6 +6,7 @@ import { copy, remove, writeJson, mkdirp } from 'fs-extra'; import { JekyllMarkdownParser } from './jekyll-markdown-parser'; import { EntryBase, ImageDimensions } from './base.types'; +import { registerAnchors, registerLinks } from './link-validator'; const README_FILE = 'README.md'; const ENTRY_FILE = 'entry.json'; @@ -53,10 +54,20 @@ export async function copyEntriesToDist( } } -/** Simple way to sort things: create a sort key that can be easily sorted */ -function getSortKey(entry: EntryBase): string { - // ISO 8601 strings sort correctly in lexicographic order - return (entry.meta.sticky ? 'Z' : 'A') + '---' + entry.meta.published + '---' + entry.slug; +/** + * Compare two entries for sorting (newest first, sticky on top). + * @returns negative if a comes first, positive if b comes first + */ +function compareEntries(a: EntryBase, b: EntryBase): number { + // 1. Sticky entries first + if (a.meta.sticky !== b.meta.sticky) { + return a.meta.sticky ? -1 : 1; + } + // 2. Then by date (newest first) - ISO 8601 strings sort lexicographically + const dateCompare = b.meta.published.localeCompare(a.meta.published); + if (dateCompare !== 0) return dateCompare; + // 3. 
Slug as tiebreaker (descending) + return b.slug.localeCompare(a.slug); } @@ -76,12 +87,18 @@ export async function markdownToEntry( markdown: string, folder: string, baseUrl: string, - blogPostsFolder: string + blogPostsFolder: string, + linkBasePath: string ): Promise { - const parser = new JekyllMarkdownParser(baseUrl + folder + '/'); - const parsedJekyllMarkdown = parser.parse(markdown); + const imageBaseUrl = baseUrl + folder + '/'; + const parser = new JekyllMarkdownParser(imageBaseUrl, linkBasePath); + const { html, parsedYaml, headingIds } = parser.parse(markdown); - const meta: Record = parsedJekyllMarkdown.parsedYaml; + // Register anchors and links for validation + registerAnchors(linkBasePath, headingIds); + registerLinks(linkBasePath, html); + + const meta: Record = parsedYaml; // Convert Date objects from js-yaml to ISO strings // js-yaml parses unquoted dates (e.g., `published: 2024-01-15`) as Date objects @@ -103,7 +120,7 @@ export async function markdownToEntry( // Type assertion: we trust that YAML contains all required properties for T return { slug: folder, - html: emoji.emojify(parsedJekyllMarkdown.html), + html: emoji.emojify(html), meta } as unknown as T; } @@ -113,12 +130,16 @@ export async function getEntryList(entriesFolder: string, m const entryDirs = await readFolders(entriesFolder); const entries: T[] = []; + // Content type from folder structure: ../blog → blog, ../material → material + const contentType = path.basename(entriesFolder); + for (const entryDir of entryDirs) { const readmePath = path.join(entriesFolder, entryDir, README_FILE); const readme = await readMarkdownFile(readmePath); - const entry = await markdownToEntry(readme, entryDir, markdownBaseUrl, entriesFolder); + const linkBasePath = '/' + contentType + '/' + entryDir; + const entry = await markdownToEntry(readme, entryDir, markdownBaseUrl, entriesFolder, linkBasePath); entries.push(entry); } - return entries.sort((a, b) => getSortKey(b).localeCompare(getSortKey(a))); + 
return entries.sort(compareEntries); } diff --git a/shared/gfm-heading-id.spec.ts b/shared/gfm-heading-id.spec.ts new file mode 100644 index 0000000..d2f7fbd --- /dev/null +++ b/shared/gfm-heading-id.spec.ts @@ -0,0 +1,230 @@ +/** + * Tests for gfm-heading-id.ts + * + * Adapted from: https://github.com/markedjs/marked-gfm-heading-id + * Original tests by marked team, MIT license. + * + * Key behaviors tested: + * 1. ID generation with github-slugger + * 2. Heading list collection + * 3. Reset functionality + * 4. text (with HTML) vs raw (plain text) separation + */ + +import { describe, it, expect, beforeEach } from 'vitest'; +import { Marked } from 'marked'; +import { gfmHeadingId, getHeadingList, resetHeadings, HeadingData } from './gfm-heading-id'; + +describe('gfm-heading-id', () => { + let marked: Marked; + + beforeEach(() => { + resetHeadings(); + marked = new Marked(gfmHeadingId()); + }); + + describe('ID generation', () => { + it('should generate lowercase slugified IDs', () => { + marked.parse('# Hello World'); + expect(getHeadingList()[0].id).toBe('hello-world'); + }); + + it('should increment IDs for duplicate headings', () => { + marked.parse('# foo\n\n# foo\n\n# foo'); + const headings = getHeadingList(); + + expect(headings[0].id).toBe('foo'); + expect(headings[1].id).toBe('foo-1'); + expect(headings[2].id).toBe('foo-2'); + }); + + it('should handle heading text that looks like an ID suffix', () => { + // "foo 1" as text should not conflict with "foo-1" as auto-suffix + marked.parse('# foo 1\n\n# foo\n\n# foo'); + const headings = getHeadingList(); + + expect(headings[0].id).toBe('foo-1'); // "foo 1" → "foo-1" + expect(headings[1].id).toBe('foo'); // first "foo" + expect(headings[2].id).toBe('foo-2'); // second "foo" → "foo-2" (not foo-1!) 
+ }); + + it('should support prefix option', () => { + marked = new Marked(gfmHeadingId({ prefix: 'custom-' })); + marked.parse('# Test'); + expect(getHeadingList()[0].id).toBe('custom-test'); + }); + + it('should handle German umlauts (github-slugger behavior)', () => { + marked.parse('# Über uns'); + expect(getHeadingList()[0].id).toBe('über-uns'); + }); + + it('should handle special characters in headings', () => { + marked.parse('# FAQ & Hilfe'); + // github-slugger removes & but keeps surrounding chars + expect(getHeadingList()[0].id).toBe('faq--hilfe'); + }); + }); + + describe('getHeadingList()', () => { + it('should collect all headings with correct levels', () => { + marked.parse('# H1\n## H2\n### H3\n#### H4\n##### H5\n###### H6'); + const headings = getHeadingList(); + + expect(headings).toHaveLength(6); + expect(headings.map(h => h.level)).toEqual([1, 2, 3, 4, 5, 6]); + }); + + it('should return HeadingData with all required properties', () => { + marked.parse('## Test Heading'); + const heading = getHeadingList()[0]; + + expect(heading).toMatchObject({ + level: 2, + text: 'Test Heading', + raw: 'Test Heading', + id: 'test-heading' + }); + }); + + it('should clear list on each new parse (preprocess hook)', () => { + marked.parse('# First'); + expect(getHeadingList()).toHaveLength(1); + + marked.parse('# Second\n## Third'); + expect(getHeadingList()).toHaveLength(2); + expect(getHeadingList()[0].raw).toBe('Second'); + }); + }); + + describe('resetHeadings()', () => { + it('should clear heading list when called manually', () => { + marked.parse('# Test'); + expect(getHeadingList()).toHaveLength(1); + + resetHeadings(); + expect(getHeadingList()).toHaveLength(0); + }); + + it('should reset slugger counter', () => { + marked.parse('# foo\n\n# foo'); + expect(getHeadingList()[1].id).toBe('foo-1'); + + resetHeadings(); + // Slug counter should start fresh + marked.parse('# foo\n\n# foo'); + expect(getHeadingList()[1].id).toBe('foo-1'); // NOT foo-3 + }); + 
}); + + describe('text vs raw separation (OUR KEY IMPROVEMENT)', () => { + /** + * This is the main value of our fork: + * - text: preserves HTML as rendered by marked (for display) + * - raw: plain text with HTML stripped and entities decoded (for TOC) + */ + + it('should preserve HTML in text but strip from raw', () => { + marked.parse('# Hello **world**'); + const heading = getHeadingList()[0]; + + expect(heading.text).toBe('Hello world'); + expect(heading.raw).toBe('Hello world'); + }); + + it('should strip inline code tags from raw', () => { + marked.parse('# Using `npm install`'); + const heading = getHeadingList()[0]; + + expect(heading.text).toContain('npm install'); + expect(heading.raw).toBe('Using npm install'); + }); + + it('should strip nested HTML tags from raw', () => { + marked.parse('# Hello world!'); + const heading = getHeadingList()[0]; + + expect(heading.text).toContain(''); + expect(heading.text).toContain(''); + expect(heading.raw).toBe('Hello world!'); + }); + + it('should decode & entities in raw', () => { + // When marked renders bold, it may produce entities + marked.parse('# Tom **&** Jerry'); // literal & in markdown + const heading = getHeadingList()[0]; + + // The & should be decoded in raw, not show as & + expect(heading.raw).toBe('Tom & Jerry'); + }); + + it('should decode " entities in raw', () => { + // Test actual entity handling - marked escapes quotes in certain contexts + marked.parse('# Title with **"quotes"**'); + const heading = getHeadingList()[0]; + + expect(heading.raw).toBe('Title with "quotes"'); + }); + + it('should decode ' and ' entities (single quotes)', () => { + // Create a heading where marked produces ' + marked.parse("# It's **fine**"); + const heading = getHeadingList()[0]; + + expect(heading.raw).toBe("It's fine"); + }); + }); + + describe('edge cases', () => { + it('should handle empty heading', () => { + marked.parse('# '); + const heading = getHeadingList()[0]; + + expect(heading.raw).toBe(''); + 
expect(heading.id).toBe(''); + }); + + it('should handle HTML comment in heading (stripped by marked)', () => { + marked.parse('# visible text'); + const heading = getHeadingList()[0]; + + // marked v17 strips comments entirely, including surrounding whitespace + expect(heading.raw).toBe('visible text'); + }); + + it('should handle raw HTML that looks like a tag (treated as HTML)', () => { + // is valid HTML, so marked treats it as such + marked.parse('# Text with emphasis'); + const heading = getHeadingList()[0]; + + expect(heading.text).toContain(''); + expect(heading.raw).toBe('Text with emphasis'); + }); + + it('should handle invalid HTML-like content', () => { + // { + it('should produce heading with id attribute', () => { + const html = marked.parse('# Test'); + expect(html).toContain('

Test

'); + }); + + it('should include newline after heading', () => { + const html = marked.parse('# Test'); + expect(html).toBe('

Test

\n'); + }); + + it('should preserve inline HTML in output', () => { + const html = marked.parse('# Hello **world**'); + expect(html).toContain('world'); + }); + }); +}); diff --git a/shared/gfm-heading-id.ts b/shared/gfm-heading-id.ts new file mode 100644 index 0000000..cc4bd46 --- /dev/null +++ b/shared/gfm-heading-id.ts @@ -0,0 +1,120 @@ +/** + * GitHub Flavored Markdown Heading ID Extension for Marked + * + * Forked from: https://github.com/markedjs/marked-gfm-heading-id (v4.1.3) + * Original license: MIT + * + * ============================================================================= + * WHY WE FORKED + * ============================================================================= + * + * The original package only provides the heading ID. We need more for TOC + * generation: both the formatted text (with HTML) AND the plain text. + * + * HeadingData provides two representations: + * + * text: "Using npm install" ← HTML preserved (for TOC links) + * raw: "Using npm install" ← Plain text (for slug generation) + * + * Example: Markdown heading `## Using \`npm install\`` + * + * 1. marked renders: "Using npm install" + * 2. We store this as `text` (used in TOC link display) + * 3. We strip HTML + decode entities → `raw` (used for ID generation) + * 4. github-slugger creates ID from raw → "using-npm-install" + * + * The TOC then shows formatted links: + * + * Using npm install + * + * Without our fork, TOC links would lose all formatting: + * + * Using npm install ← formatting lost! + * + * ============================================================================= + * CHANGES FROM ORIGINAL + * ============================================================================= + * + * - Converted to TypeScript + * - Simplified API (removed globalSlugs option - we always reset per document) + * - Added `text` field: HTML as rendered by marked (preserves , , etc.) 
+ * - Added `raw` field: Plain text with HTML stripped and entities decoded + * - Slug generation uses `raw` (decoded text, not HTML) + */ + +import GithubSlugger from 'github-slugger'; +import type { MarkedExtension, Tokens } from 'marked'; +import { decodeHtmlEntities, stripHtmlTags } from './html.utils'; + +export interface HeadingData { + /** Heading level (1-6) */ + level: number; + /** Formatted text with HTML preserved: "Using npm" */ + text: string; + /** Plain text (HTML stripped, entities decoded): "Using npm" */ + raw: string; + /** Anchor ID for linking: "using-npm" (generated by github-slugger) */ + id: string; +} + +let slugger = new GithubSlugger(); +let headings: HeadingData[] = []; + +/** + * Create a marked extension that adds GitHub-style heading IDs. + * + * @param options.prefix - Optional prefix for all heading IDs + * @returns MarkedExtension to pass to marked.use() + * + * @example + * ```typescript + * const marked = new Marked(gfmHeadingId()); + * marked.parse('# Hello World'); + * //

Hello World

+ * ``` + */ +export function gfmHeadingId({ prefix = '' } = {}): MarkedExtension { + return { + hooks: { + preprocess(src: string): string { + // Always reset for each document (we process one doc at a time) + resetHeadings(); + return src; + }, + }, + renderer: { + heading({ tokens, depth }: Tokens.Heading): string { + // Get the rendered HTML text (may contain HTML entities and tags) + // @ts-ignore - 'this' context is provided by marked at runtime + const text: string = this.parser.parseInline(tokens); + + // Get raw text: decode entities, strip HTML tags + const raw = stripHtmlTags(decodeHtmlEntities(text)).trim(); + + const level = depth; + const id = `${prefix}${slugger.slug(raw)}`; + + headings.push({ level, text, id, raw }); + + return `${text}\n`; + }, + }, + }; +} + +/** + * Get the list of headings collected during the last parse. + * Call this after marked.parse() to get heading data for TOC generation. + */ +export function getHeadingList(): HeadingData[] { + return headings; +} + +/** + * Reset the heading list and slugger. + * Called automatically in preprocess hook, but can be called manually if needed. + */ +export function resetHeadings(): void { + headings = []; + slugger = new GithubSlugger(); +} diff --git a/shared/html.utils.spec.ts b/shared/html.utils.spec.ts new file mode 100644 index 0000000..fc590cb --- /dev/null +++ b/shared/html.utils.spec.ts @@ -0,0 +1,186 @@ +import { describe, it, expect } from 'vitest'; +import { stripHtmlTags, decodeHtmlEntities, escapeHtml } from './html.utils'; + +describe('stripHtmlTags', () => { + it('should return empty string for empty input', () => { + expect(stripHtmlTags('')).toBe(''); + }); + + it('should return text unchanged when no HTML tags present', () => { + expect(stripHtmlTags('Hello World')).toBe('Hello World'); + }); + + it('should strip simple HTML tags', () => { + expect(stripHtmlTags('

Hello

')).toBe('Hello'); + }); + + it('should strip tags with attributes', () => { + expect(stripHtmlTags('Link')).toBe('Link'); + }); + + it('should strip multiple tags', () => { + expect(stripHtmlTags('

Hello

World
')).toBe('HelloWorld'); + }); + + it('should strip self-closing tags', () => { + expect(stripHtmlTags('Before
After')).toBe('BeforeAfter'); + expect(stripHtmlTags('Before
After')).toBe('BeforeAfter'); + }); + + it('should strip img tags', () => { + expect(stripHtmlTags('Test')).toBe(''); + }); + + it('should preserve text between tags', () => { + expect(stripHtmlTags('Bold and italic')).toBe('Bold and italic'); + }); + + it('should handle nested tags', () => { + expect(stripHtmlTags('

Deep

')).toBe('Deep'); + }); + + it('should handle tags with multiple attributes', () => { + expect(stripHtmlTags('')).toBe(''); + }); + + it('should preserve whitespace between tags', () => { + expect(stripHtmlTags('

Hello

World

')).toBe('Hello World'); + }); + + it('should handle HTML comments by stripping them', () => { + expect(stripHtmlTags('BeforeAfter')).toBe('BeforeAfter'); + }); +}); + +describe('decodeHtmlEntities', () => { + it('should return empty string for empty input', () => { + expect(decodeHtmlEntities('')).toBe(''); + }); + + it('should return text unchanged when no entities present', () => { + expect(decodeHtmlEntities('Hello World')).toBe('Hello World'); + }); + + it('should decode & to &', () => { + expect(decodeHtmlEntities('Tom & Jerry')).toBe('Tom & Jerry'); + }); + + it('should decode < to <', () => { + expect(decodeHtmlEntities('a < b')).toBe('a < b'); + }); + + it('should decode > to >', () => { + expect(decodeHtmlEntities('a > b')).toBe('a > b'); + }); + + it('should decode " to "', () => { + expect(decodeHtmlEntities('He said "hello"')).toBe('He said "hello"'); + }); + + it('should decode ' to single quote', () => { + expect(decodeHtmlEntities("It's fine")).toBe("It's fine"); + }); + + it('should decode ' to single quote', () => { + expect(decodeHtmlEntities("It's fine")).toBe("It's fine"); + }); + + it('should decode / to /', () => { + expect(decodeHtmlEntities('path/to/file')).toBe('path/to/file'); + }); + + it('should decode multiple entities in one string', () => { + expect(decodeHtmlEntities('<div class="test">')).toBe('
'); + }); + + it('should handle entities at start and end', () => { + expect(decodeHtmlEntities('&start and end&')).toBe('&start and end&'); + }); + + it('should handle multiple consecutive same entities', () => { + expect(decodeHtmlEntities('&&&')).toBe('&&&'); + }); + + it('should decode Array pattern (common in code)', () => { + expect(decodeHtmlEntities('Array<string>')).toBe('Array'); + }); + + it('should decode generic TypeScript code pattern', () => { + expect(decodeHtmlEntities('Map<string, number>')).toBe('Map'); + }); +}); + +describe('escapeHtml', () => { + it('should return empty string for empty input', () => { + expect(escapeHtml('')).toBe(''); + }); + + it('should return text unchanged when no special chars present', () => { + expect(escapeHtml('Hello World')).toBe('Hello World'); + }); + + it('should escape & to &', () => { + expect(escapeHtml('Tom & Jerry')).toBe('Tom & Jerry'); + }); + + it('should escape " to "', () => { + expect(escapeHtml('He said "hello"')).toBe('He said "hello"'); + }); + + it('should escape < to <', () => { + expect(escapeHtml('a < b')).toBe('a < b'); + }); + + it('should escape > to >', () => { + expect(escapeHtml('a > b')).toBe('a > b'); + }); + + it("should escape ' to '", () => { + expect(escapeHtml("It's fine")).toBe("It's fine"); + }); + + it('should escape all special characters in one string', () => { + expect(escapeHtml('
&
')).toBe('<div class="test">&</div>'); + }); + + it('should handle multiple ampersands correctly', () => { + // Ampersands must be escaped first to avoid double-escaping + expect(escapeHtml('a & b & c')).toBe('a & b & c'); + }); + + it('should escape HTML tag patterns', () => { + expect(escapeHtml('')).toBe('<script>alert("xss")</script>'); + }); + + it('should escape TypeScript generic syntax', () => { + expect(escapeHtml('Array')).toBe('Array<string>'); + }); + + it('should be reversible with decodeHtmlEntities', () => { + const original = 'Tom & Jerry <3 "quotes"'; + const escaped = escapeHtml(original); + const decoded = decodeHtmlEntities(escaped); + expect(decoded).toBe(original); + }); +}); + +describe('escapeHtml and decodeHtmlEntities roundtrip', () => { + const testCases = [ + 'Simple text', + 'Tom & Jerry', + 'a < b > c', + 'He said "hello"', + "It's fine", + '
Content & more
', + 'Array>', + '& already encoded', + ]; + + testCases.forEach((input) => { + it(`should roundtrip: ${input.substring(0, 30)}...`, () => { + const escaped = escapeHtml(input); + const decoded = decodeHtmlEntities(escaped); + expect(decoded).toBe(input); + }); + }); +}); diff --git a/shared/html.utils.ts b/shared/html.utils.ts new file mode 100644 index 0000000..8aeb971 --- /dev/null +++ b/shared/html.utils.ts @@ -0,0 +1,37 @@ +/** + * Shared HTML utility functions. + */ + +/** + * Strip all HTML tags from a string, leaving only text content. + */ +export function stripHtmlTags(html: string): string { + return html.replace(/<[^>]*>/g, ''); +} + +/** + * Decode common HTML entities to their original characters. + */ +export function decodeHtmlEntities(html: string): string { + return html + .replace(/&/g, '&') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/'/g, "'") + .replace(///g, '/'); +} + +/** + * Escape special HTML characters for use in attribute values. + * Escapes: & " ' < > + */ +export function escapeHtml(text: string): string { + return text + .replace(/&/g, '&') + .replace(/"/g, '"') + .replace(/'/g, ''') + .replace(//g, '>'); +} diff --git a/shared/jekyll-markdown-parser.spec.ts b/shared/jekyll-markdown-parser.spec.ts index a8ccd21..5ab4aec 100644 --- a/shared/jekyll-markdown-parser.spec.ts +++ b/shared/jekyll-markdown-parser.spec.ts @@ -1,9 +1,9 @@ import { describe, it, expect } from 'vitest'; import { Marked } from 'marked'; import { markedHighlight } from 'marked-highlight'; -import { gfmHeadingId } from 'marked-gfm-heading-id'; +import { gfmHeadingId } from './gfm-heading-id'; import hljs from 'highlight.js'; -import { JekyllMarkdownParser, MARKDOWN_BASE_URL_PLACEHOLDER } from './jekyll-markdown-parser'; +import { JekyllMarkdownParser, MARKDOWN_BASE_URL_PLACEHOLDER, TOC_MARKER } from './jekyll-markdown-parser'; /** * Create a Marked instance with the same extensions as JekyllMarkdownParser. 
@@ -385,6 +385,7 @@ describe('Configured marked behavior (baseline)', () => { */ describe('JekyllMarkdownParser', () => { const baseUrl = 'https://example.com/blog/my-post/'; + const linkBasePath = '/blog/my-post'; describe('Comprehensive regression test (marked upgrade safety)', () => { /** @@ -392,7 +393,7 @@ describe('JekyllMarkdownParser', () => { * If this test fails, the upgrade broke something important! */ it('should produce expected output for comprehensive blog post', () => { - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const { parsedYaml, html } = parser.parse(COMPREHENSIVE_BLOG_POST); // === YAML Frontmatter === @@ -465,7 +466,7 @@ describe('JekyllMarkdownParser', () => { * 3. Update EXPECTED_HTML_WITH_IMAGE_TRANSFORM only if the change is intentional */ it('should produce EXACT HTML output (character-by-character)', () => { - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(COMPREHENSIVE_BLOG_POST); expect(result.html).toBe(EXPECTED_HTML_WITH_IMAGE_TRANSFORM); @@ -483,19 +484,18 @@ author: John Doe This is a test. `; - const parser = new JekyllMarkdownParser(baseUrl); - const { parsedYaml, html, markdown } = parser.parse(input); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const { parsedYaml, html } = parser.parse(input); expect(parsedYaml.title).toBe('Test Post'); expect(parsedYaml.author).toBe('John Doe'); expect(html).toContain('

Hello World

'); expect(html).toContain('

This is a test.

'); - expect(markdown).toBe('\n# Hello World\n\nThis is a test.\n'); }); it('should throw for markdown without frontmatter', () => { const input = '# Just Markdown\n\nNo frontmatter here.'; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); expect(() => parser.parse(input)).toThrow('YAML frontmatter is required'); }); @@ -509,7 +509,7 @@ title: Test ![Alt text](image.png) `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}image.png"`); @@ -523,7 +523,7 @@ title: Test ![Alt text](image.png "Image Title") `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('
'); @@ -540,7 +540,7 @@ title: Test ![External](https://other.com/image.png) `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="https://other.com/image.png"'); @@ -554,7 +554,7 @@ title: Test ![Alt](./image.png) `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}image.png"`); @@ -568,7 +568,7 @@ title: Test ![Data](data:image/png;base64,ABC123) `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="data:image/png;base64,ABC123"'); @@ -581,7 +581,7 @@ title: Test ![Icon](assets/img/icon.svg) `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="assets/img/icon.svg"'); @@ -597,7 +597,7 @@ title: Test Photo `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}photo.jpg"`); @@ -611,7 +611,7 @@ title: Test Photo `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}photo.jpg"`); @@ -625,7 +625,7 @@ title: Test Photo `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}images/photo.jpg"`); @@ -638,7 +638,7 @@ title: Test External `; - const parser = new 
JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="https://other.com/image.png"'); @@ -652,7 +652,7 @@ title: Test CDN `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="//cdn.example.com/image.png"'); @@ -666,7 +666,7 @@ title: Test Data `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="data:image/png;base64,ABC123"'); @@ -680,7 +680,7 @@ title: Test Icon `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="assets/img/icon.svg"'); @@ -694,7 +694,7 @@ title: Test Logo `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="/images/logo.png"'); @@ -708,7 +708,7 @@ title: Test Photo `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}photo.jpg"`); @@ -726,7 +726,7 @@ title: Test Second Third `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}first.jpg"`); @@ -741,7 +741,7 @@ title: Test Photo `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); 
expect(result.html).toContain(`src='${baseUrl}photo.jpg'`); @@ -755,7 +755,7 @@ title: Test External `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain("src='https://example.com/external.png'"); @@ -776,7 +776,7 @@ title: Test This has highlighted text and HTML. `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('highlighted'); @@ -792,7 +792,7 @@ title: Test

Custom styled content

`; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('
'); @@ -806,7 +806,7 @@ title: Test A special image `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('class="rounded shadow"'); @@ -824,7 +824,7 @@ title: Test Example \`\`\` `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); // Code is escaped and syntax-highlighted by highlight.js @@ -845,7 +845,7 @@ title: Test Code \`\`\` `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain(`src="${baseUrl}real-image.jpg"`); @@ -869,7 +869,7 @@ title: Test ![HTTP Image](http://insecure.com/image.png) `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="http://insecure.com/image.png"'); @@ -883,7 +883,7 @@ title: Test HTTP `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('src="http://insecure.com/image.png"'); @@ -899,7 +899,7 @@ title: Test ![He said "hello"](image.png) `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); // Quotes should be escaped to prevent broken HTML @@ -918,7 +918,7 @@ title: Test ![Alt](image.png "Title with "quotes"") `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); // Marked does NOT parse this as an image - it becomes literal text @@ -933,7 +933,7 @@ title: Test ![Array](image.png) `; - const parser 
= new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('alt="Array<string>"'); @@ -946,7 +946,7 @@ title: Test ![Tom & Jerry](image.png) `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.html).toContain('alt="Tom & Jerry"'); @@ -956,7 +956,7 @@ title: Test describe('YAML frontmatter edge cases', () => { it('should handle Windows line endings (CRLF)', () => { const input = '---\r\ntitle: Test\r\n---\r\n\r\n# Hello'; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const { parsedYaml, html } = parser.parse(input); expect(parsedYaml.title).toBe('Test'); @@ -966,7 +966,7 @@ title: Test it('should throw for only one separator (no valid frontmatter)', () => { const input = '---\nThis is not YAML, just a horizontal rule\n\n# Hello'; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); expect(() => parser.parse(input)).toThrow('YAML frontmatter is required'); }); @@ -982,7 +982,7 @@ title: Test This is after a horizontal rule. `; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.parsedYaml.title).toBe('Test'); @@ -992,7 +992,7 @@ This is after a horizontal rule. it('should handle trailing whitespace after --- separator', () => { const input = '--- \ntitle: Test\n---\t\n\n# Hello'; - const parser = new JekyllMarkdownParser(baseUrl); + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); const result = parser.parse(input); expect(result.parsedYaml.title).toBe('Test'); @@ -1008,13 +1008,14 @@ This is after a horizontal rule. // 2. 
transformRelativeImagePaths runs on the ENTIRE HTML output // 3. It must NOT add baseUrl again to URLs that already start with the placeholder const placeholderBaseUrl = `${MARKDOWN_BASE_URL_PLACEHOLDER}/blog/my-post/`; + const placeholderLinkPath = '/blog/my-post'; const input = `--- title: Test --- ![Screenshot](screenshot.png) `; - const parser = new JekyllMarkdownParser(placeholderBaseUrl); + const parser = new JekyllMarkdownParser(placeholderBaseUrl, placeholderLinkPath); const result = parser.parse(input); // Should have exactly ONE placeholder prefix, not two! @@ -1025,13 +1026,14 @@ title: Test it('should NOT double-prefix raw HTML images with placeholder in src', () => { // Edge case: What if someone manually writes the placeholder in HTML? const placeholderBaseUrl = `${MARKDOWN_BASE_URL_PLACEHOLDER}/blog/my-post/`; + const placeholderLinkPath = '/blog/my-post'; const input = `--- title: Test --- Already prefixed `; - const parser = new JekyllMarkdownParser(placeholderBaseUrl); + const parser = new JekyllMarkdownParser(placeholderBaseUrl, placeholderLinkPath); const result = parser.parse(input); // Should NOT add another prefix @@ -1040,6 +1042,284 @@ title: Test }); }); + describe('Relative link transformation', () => { + it('should transform #anchor to absolute path', () => { + const input = `--- +title: Test +--- + +Check the [introduction](#introduction) section. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/my-post#introduction"'); + }); + + it('should transform ../sibling-slug to absolute path', () => { + const input = `--- +title: Test +--- + +See [other article](../other-post) for more. 
+`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/other-post"'); + }); + + it('should transform ../sibling-slug#section to absolute path with anchor', () => { + const input = `--- +title: Test +--- + +See [Angular 10](../2020-06-angular10#setup) for details. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/2020-06-angular10#setup"'); + }); + + it('should NOT transform external https:// links', () => { + const input = `--- +title: Test +--- + +Check [Angular docs](https://angular.io/docs) for more. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="https://angular.io/docs"'); + }); + + it('should NOT transform external http:// links', () => { + const input = `--- +title: Test +--- + +Check [old site](http://example.com) for more. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="http://example.com"'); + }); + + it('should NOT transform already-absolute paths starting with /', () => { + const input = `--- +title: Test +--- + +Check [another post](/blog/2023-01-other-post) for more. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/2023-01-other-post"'); + }); + + it('should NOT transform already-absolute paths with hash', () => { + const input = `--- +title: Test +--- + +Check [section](/blog/2023-01-other-post#setup) for more. 
+`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/2023-01-other-post#setup"'); + }); + + it('should NOT transform absolute paths in raw HTML anchor tags', () => { + const input = `--- +title: Test +--- + +Other post +Section link +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/other-post"'); + expect(result.html).toContain('href="/blog/other-post#section"'); + }); + + it('should NOT transform https:// links in raw HTML anchor tags', () => { + const input = `--- +title: Test +--- + +Angular Docs +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="https://angular.io/guide/components"'); + }); + + it('should NOT transform mailto: links', () => { + const input = `--- +title: Test +--- + +Contact us at [team@example.com](mailto:team@example.com). +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="mailto:team@example.com"'); + }); + + it('should NOT transform tel: links', () => { + const input = `--- +title: Test +--- + +Call us at [+49 123 456](tel:+49123456). +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="tel:+49123456"'); + }); + + it('should NOT transform ftp:// links', () => { + const input = `--- +title: Test +--- + +Download from [FTP](ftp://files.example.com/file.zip). 
+`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="ftp://files.example.com/file.zip"'); + }); + + it('should NOT transform mailto: in raw HTML', () => { + const input = `--- +title: Test +--- + +Mail +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="mailto:team@angular-buch.com"'); + }); + + it('should transform ./relative links to current path', () => { + const input = `--- +title: Test +--- + +See [local file](./diagram.svg) for illustration. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/my-post/diagram.svg"'); + }); + + it('should transform multiple anchor links in TOC', () => { + const input = `--- +title: Test +--- + +## Inhalt + +- [Einleitung](#einleitung) +- [Hauptteil](#hauptteil) +- [Fazit](#fazit) +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/my-post#einleitung"'); + expect(result.html).toContain('href="/blog/my-post#hauptteil"'); + expect(result.html).toContain('href="/blog/my-post#fazit"'); + }); + + it('should handle raw HTML anchor tags with relative hrefs', () => { + const input = `--- +title: Test +--- + +Jump to section +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/my-post#section"'); + }); + + it('should preserve other attributes on anchor tags', () => { + const input = `--- +title: Test +--- + +Section +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/my-post#section"'); + expect(result.html).toContain('class="nav-link"'); + 
expect(result.html).toContain('id="toc-1"'); + }); + + it('should work with material paths', () => { + const materialLinkPath = '/material/signal-forms'; + const input = `--- +title: Test +--- + +See [other material](../other-material#section) for more. +`; + const parser = new JekyllMarkdownParser(baseUrl, materialLinkPath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/material/other-material#section"'); + }); + + it('should handle deeply nested relative paths', () => { + const input = `--- +title: Test +--- + +See [root](../../other) for more. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/other"'); + }); + + it('should NOT transform links inside code blocks', () => { + const input = `--- +title: Test +--- + +\`\`\`html +Link in code +\`\`\` +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // Code is escaped by highlight.js, so the link should not be transformed + // The important assertion: no transformed href in the output + expect(result.html).toContain('language-html'); + expect(result.html).not.toContain('href="/blog/my-post#section"'); + }); + }); + describe('baseUrl edge cases', () => { it('should work correctly when baseUrl has no trailing slash', () => { const baseUrlNoSlash = 'https://example.com/blog/my-post'; @@ -1049,7 +1329,7 @@ title: Test ![Alt](image.png) `; - const parser = new JekyllMarkdownParser(baseUrlNoSlash); + const parser = new JekyllMarkdownParser(baseUrlNoSlash, linkBasePath); const result = parser.parse(input); // Without trailing slash, path gets concatenated directly @@ -1065,11 +1345,288 @@ title: Test ![Alt](image.png) `; - const parser = new JekyllMarkdownParser(baseUrlWithSlash); + const parser = new JekyllMarkdownParser(baseUrlWithSlash, linkBasePath); const result = parser.parse(input); 
expect(result.html).toContain('src="https://example.com/blog/my-post/image.png"'); }); }); + + describe('Table of Contents (TOC) generation', () => { + it('should replace ${TOC_MARKER} marker with generated TOC', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## Einleitung + +Text. + +## Fazit + +End. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // TOC should contain links to headings after the marker + expect(result.html).toContain('href="/blog/my-post#einleitung"'); + expect(result.html).toContain('href="/blog/my-post#fazit"'); + // Should NOT contain the raw marker + expect(result.html).not.toContain('${TOC_MARKER}'); + }); + + it('should skip headings before ${TOC_MARKER} marker', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## Hauptteil + +Text. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // "Inhalt" heading should NOT be in the TOC links + expect(result.html).not.toContain('>Inhalt'); + // But "Hauptteil" should be in TOC + expect(result.html).toContain('href="/blog/my-post#hauptteil"'); + }); + + it('should include h2 and h3 headings with proper nesting', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## Kapitel 1 + +Text. + +### Unterkapitel 1.1 + +More text. + +## Kapitel 2 + +End. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/my-post#kapitel-1"'); + expect(result.html).toContain('href="/blog/my-post#unterkapitel-11"'); + expect(result.html).toContain('href="/blog/my-post#kapitel-2"'); + }); + + it('should handle special characters in headings', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## FAQ & Hilfe + +Text. + +## Über uns + +More. 
+`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('href="/blog/my-post#faq--hilfe"'); + // Note: marked URL-encodes non-ASCII chars in hrefs, but browser handles both + expect(result.html).toContain('href="/blog/my-post#%C3%BCber-uns"'); + // The link text should contain the original characters (HTML-escaped) + expect(result.html).toContain('>FAQ & Hilfe'); + expect(result.html).toContain('>Über uns'); + }); + + it('should work without ${TOC_MARKER} marker (no changes)', () => { + const input = `--- +title: Test +--- + +## Heading + +Text. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + expect(result.html).toContain('

Heading

'); + expect(result.html).not.toContain('${TOC_MARKER}'); + }); + + it('should generate empty TOC when no headings after marker', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +Just text, no more headings. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // Should not contain the marker + expect(result.html).not.toContain('${TOC_MARKER}'); + // TOC area should be essentially empty (just the Inhalt heading) + expect(result.html).toContain('

Inhalt

'); + }); + + it('should preserve inline code formatting in TOC links', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## Using \`npm install\` + +Text. + +## The \`async\` Keyword + +More text. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // TOC links should contain tags (rendered from markdown) + expect(result.html).toContain('npm install'); + expect(result.html).toContain('async'); + // The actual headings should also have code formatting + expect(result.html).toContain('

Using npm install

'); + }); + + it('should preserve bold and italic formatting in TOC links', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## This is **important** + +Text. + +## Use *caution* here + +More text. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // TOC links should contain formatting tags + expect(result.html).toContain('important'); + expect(result.html).toContain('caution here'); + }); + + it('should preserve mixed formatting in TOC links', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## Using \`rxResource\` with **Signals** + +Complex example. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // Should have both code and bold formatting + expect(result.html).toContain('rxResource'); + expect(result.html).toContain('Signals'); + // Verify the complete link structure + expect(result.html).toContain('Using rxResource with Signals'); + }); + + it('should handle headings with only code (no plain text)', () => { + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## \`package.json\` + +Config file. +`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + const result = parser.parse(input); + + // The entire heading is code + expect(result.html).toContain('package.json'); + expect(result.html).toContain('id="packagejson"'); + }); + + it('should warn about duplicate headings (known limitation)', () => { + // KNOWN LIMITATION: If the same heading text appears multiple times, + // TOC links may not work correctly due to ID suffix mismatch. + // We warn about this but don't fix it (very rare edge case). + const input = `--- +title: Test +--- + +## Inhalt + +${TOC_MARKER} + +## Fazit + +Text. + +## Fazit + +End. 
+`; + const parser = new JekyllMarkdownParser(baseUrl, linkBasePath); + + // Capture console.warn + const warnings: string[] = []; + const originalWarn = console.warn; + console.warn = (msg: string) => warnings.push(msg); + + parser.parse(input); + + console.warn = originalWarn; + + // Should warn about duplicate heading + expect(warnings.length).toBe(1); + expect(warnings[0]).toContain('Duplicate heading'); + expect(warnings[0]).toContain('Fazit'); + }); + }); }); }); diff --git a/shared/jekyll-markdown-parser.ts b/shared/jekyll-markdown-parser.ts index 0ac21b1..4a29c00 100644 --- a/shared/jekyll-markdown-parser.ts +++ b/shared/jekyll-markdown-parser.ts @@ -1,11 +1,28 @@ +import { posix as path } from 'path'; import { load } from 'js-yaml'; import { Marked, Renderer, Tokens } from 'marked'; import { markedHighlight } from 'marked-highlight'; -import { gfmHeadingId } from 'marked-gfm-heading-id'; +import { gfmHeadingId, getHeadingList, resetHeadings } from './gfm-heading-id'; import hljs from 'highlight.js'; +import { escapeHtml } from './html.utils'; +// Precompiled regexes for performance +const PROTOCOL_REGEX = /^\w+:/; +const IMG_SRC_REGEX = /]*)\ssrc=(["'])([^"']+)\2/g; +const ANCHOR_HREF_REGEX = /]*)\shref=(["'])([^"']+)\2/g; + +/** + * Placeholder for image base URL. Replaced at runtime by the Angular app. + * See "URL TRANSFORMATION SYSTEM" below for details. + */ export const MARKDOWN_BASE_URL_PLACEHOLDER = '%%MARKDOWN_BASE_URL%%'; +/** + * Marker for automatic table of contents generation. + * Place [[toc]] in your markdown and it will be replaced with a generated TOC. 
+ */ +export const TOC_MARKER = '[[toc]]'; + /** * ============================================================================ * MODIFIED PARSER - Based on bouzuya/jekyll-markdown-parser @@ -14,8 +31,52 @@ export const MARKDOWN_BASE_URL_PLACEHOLDER = '%%MARKDOWN_BASE_URL%%'; * Original source: https://github.com/bouzuya/jekyll-markdown-parser * Repository archived on Jun 28, 2020 (read-only, no longer maintained) * - * SECURITY NOTE: - * -------------- + * ============================================================================ + * URL TRANSFORMATION SYSTEM + * ============================================================================ + * + * This parser handles two types of URL transformations: + * + * 1. IMAGES (baseUrl with MARKDOWN_BASE_URL_PLACEHOLDER) + * ------------------------------------------------------- + * Images use a placeholder that gets replaced at runtime by the Angular app. + * This allows serving images from different origins (CDN, local dev, etc.). + * + * Markdown: ![Alt](image.png) + * Build: + * Runtime: + * + * The placeholder is replaced in the Angular app based on environment config. + * This decouples the build from the deployment target. + * + * 2. LINKS (linkBasePath for relative → absolute transformation) + * --------------------------------------------------------------- + * Links are transformed from relative paths to absolute paths at build time. + * This is necessary because Angular uses which breaks + * relative anchor links (e.g., #section would navigate to /#section). + * + * Markdown: [Section](#section) + * Build: + * + * Markdown: [Other Post](../other-slug) + * Build: + * + * Markdown: [Other Section](../other-slug#intro) + * Build: + * + * The linkBasePath is derived from the folder structure: + * blog/my-slug/README.md → linkBasePath = "/blog/my-slug" + * + * WHY TWO DIFFERENT APPROACHES? 
+ * - Images: Need runtime flexibility (CDN on prod, proxy during development) + * - Links: The Angular website mimics the folder structure of this repo. + * blog/ content is served at /blog/, material/ at /material/. + * That's why build-time resolution works: folder path = URL path. + * + * ============================================================================ + * SECURITY NOTE + * ============================================================================ + * * This parser does NOT sanitize or escape HTML content. Raw HTML in markdown * is passed through intentionally. This is a FEATURE, not a bug. * @@ -24,8 +85,10 @@ export const MARKDOWN_BASE_URL_PLACEHOLDER = '%%MARKDOWN_BASE_URL%%'; * All markdown content comes from our own Git repository. There is no * user-generated content. XSS is not a concern in this context. * - * CHANGES FROM ORIGINAL: - * ----------------------- + * ============================================================================ + * CHANGES FROM ORIGINAL + * ============================================================================ + * * 1. BUG FIX: Regex in separate() had typo `/^---s*$/` instead of `/^---\s*$/`. * This bug exists in the original bouzuya source code (never fixed). * The literal `s*` matches zero or more 's' characters, not whitespace. @@ -37,21 +100,34 @@ export const MARKDOWN_BASE_URL_PLACEHOLDER = '%%MARKDOWN_BASE_URL%%'; * 3. FEATURE: Added transformRelativeImagePaths() to handle raw HTML * tags that bypass the markdown renderer. * - * 4. CHANGE: Converted from CommonJS module to ES6 class with constructor - * for baseUrl injection. + * 4. FEATURE: Added transformRelativeLinks() to convert relative links to + * absolute paths, fixing issues in Angular. + * + * 5. CHANGE: Converted from CommonJS module to ES6 class with constructor + * for baseUrl and linkBasePath injection. * - * 5. UPGRADE: marked v4 → v17 migration + * 6. 
UPGRADE: marked v4 → v17 migration * - Using Marked class instance instead of global marked * - marked-highlight extension for syntax highlighting - * - marked-gfm-heading-id extension for heading IDs + * - Custom gfm-heading-id fork for heading IDs (see ./gfm-heading-id/) * - Token-based renderer API (token object instead of separate params) + * + * 7. REFACTOR: Shared utilities extracted to ./html.utils.ts + * - escapeHtml, decodeHtmlEntities, stripHtmlTags * ============================================================================ */ export class JekyllMarkdownParser { private marked: Marked; - constructor(private baseUrl: string) { + /** + * @param baseUrl - Base URL for images (e.g., '%%MARKDOWN_BASE_URL%%/blog/my-slug/') + * @param linkBasePath - Absolute path for links (e.g., '/blog/my-slug') + */ + constructor( + private baseUrl: string, + private linkBasePath: string + ) { this.marked = this.createMarkedInstance(); } @@ -70,12 +146,17 @@ export class JekyllMarkdownParser { /** * Check if a URL is absolute (should not be transformed). - * Absolute URLs include: https://, http://, data:, //, assets/, / + * Matches: protocols (mailto:, tel:, https:, etc.), protocol-relative (//), + * absolute paths (/), asset paths, and placeholder URLs. */ private isAbsoluteUrl(url: string): boolean { - return url.startsWith('https://') || url.startsWith('http://') || - url.startsWith('data:') || url.startsWith('//') || - url.startsWith('assets/') || url.startsWith('/') || + // Protocol pattern: word characters followed by colon (mailto:, tel:, https:, http:, ftp:, data:, etc.) + if (PROTOCOL_REGEX.test(url)) { + return true; + } + return url.startsWith('//') || + url.startsWith('/') || + url.startsWith('assets/') || url.startsWith(MARKDOWN_BASE_URL_PLACEHOLDER); } @@ -87,14 +168,49 @@ export class JekyllMarkdownParser { } /** - * Escape special HTML characters in attribute values. + * Generate a table of contents as Markdown from the document's headings. 
+ * Only includes headings that appear AFTER the [[toc]] marker. + * + * @param markdown - The markdown content to extract headings from + * @returns Markdown list with links to headings, or empty string if no headings */ - private escapeHtml(text: string): string { - return text - .replace(/&/g, '&') - .replace(/"/g, '"') - .replace(//g, '>'); + private generateToc(markdown: string): string { + // Split at marker - only parse content AFTER the marker + const parts = markdown.split(TOC_MARKER); + if (parts.length < 2) { + return ''; + } + + const contentAfterMarker = parts.slice(1).join(TOC_MARKER); // Handle multiple markers (edge case) + + // Parse only the part after [[toc]] to collect headings + resetHeadings(); + this.marked.parse(contentAfterMarker); + const headings = getHeadingList(); + + // Filter to h2, h3, and h4 + const relevantHeadings = headings.filter(h => h.level >= 2 && h.level <= 4); + + if (relevantHeadings.length === 0) { + return ''; + } + + // Warn about duplicate headings (would cause ID mismatch if also before marker) + const seenRaw = new Set(); + for (const h of relevantHeadings) { + if (seenRaw.has(h.raw)) { + console.warn(`WARNING: Duplicate heading "${h.raw}" - TOC links may not work correctly`); + } + seenRaw.add(h.raw); + } + + // Generate markdown list + return relevantHeadings + .map(h => { + const indent = ' '.repeat(h.level - 2); // h2='', h3=' ', h4=' ' + return `${indent}* [${h.text}](#${h.id})`; + }) + .join('\n'); } /** @@ -115,10 +231,10 @@ export class JekyllMarkdownParser { src = this.baseUrl + this.normalizeRelativeUrl(token.href); } - const escapedAlt = this.escapeHtml(token.text); + const escapedAlt = escapeHtml(token.text); if (token.title) { - const escapedTitle = this.escapeHtml(token.title); + const escapedTitle = escapeHtml(token.title); const imgTag = `${escapedAlt}`; return `
${imgTag}
${escapedTitle}
`; } @@ -129,7 +245,7 @@ export class JekyllMarkdownParser { // Transform relative paths in raw HTML tags to absolute URLs // Supports both double quotes (src="...") and single quotes (src='...') private transformRelativeImagePaths(html: string): string { - return html.replace(/]*)\ssrc=(["'])([^"']+)\2/g, (match, attrs, quote, src) => { + return html.replace(IMG_SRC_REGEX, (match, attrs, quote, src) => { if (this.isAbsoluteUrl(src)) { return match; } @@ -137,6 +253,33 @@ export class JekyllMarkdownParser { }); } + /** + * Transform relative links to absolute paths. + * Fixes issue where #anchor resolves to /#anchor. + * + * Uses path.posix.resolve() for proper relative path resolution: + * - #section → /blog/my-slug#section + * - ../other-slug → /blog/other-slug + * - ../other-slug#section → /blog/other-slug#section + */ + private transformRelativeLinks(html: string): string { + return html.replace(ANCHOR_HREF_REGEX, (match, attrs, quote, href) => { + if (this.isAbsoluteUrl(href)) { + return match; + } + + const hasHash = href.includes('#'); + const [pathPart, hash] = hasHash ? href.split('#') : [href, '']; + + const resolved = pathPart + ? path.resolve(this.linkBasePath + '/', pathPart) + : this.linkBasePath; + + const newHref = hasHash ? 
resolved + '#' + hash : resolved; + return ` h.id); + const withImages = this.transformRelativeImagePaths(html); + const finalHtml = this.transformRelativeLinks(withImages); + return { html: finalHtml, headingIds }; } private parseYaml(yaml: string): Record { @@ -178,19 +336,13 @@ export class JekyllMarkdownParser { public parse(jekyllMarkdown: string): { html: string; - yaml: string; parsedYaml: Record; - markdown: string; + headingIds: string[]; } { const { yaml, markdown } = this.separate(jekyllMarkdown); const parsedYaml = this.parseYaml(yaml); - const html = this.compileMarkdown(markdown); - - return { - html, - markdown, - parsedYaml, - yaml - }; + const { html, headingIds } = this.compileMarkdown(markdown); + + return { html, parsedYaml, headingIds }; } } diff --git a/shared/link-validator.spec.ts b/shared/link-validator.spec.ts new file mode 100644 index 0000000..4664d81 --- /dev/null +++ b/shared/link-validator.spec.ts @@ -0,0 +1,218 @@ +import { describe, it, expect, beforeEach } from 'vitest'; +import { + registerAnchors, + registerLinks, + validateLinks, + resetValidator, + getAnchors, + getLinks +} from './link-validator'; + +describe('link-validator', () => { + beforeEach(() => { + resetValidator(); + }); + + describe('registerAnchors', () => { + it('should register anchors for a path', () => { + registerAnchors('/blog/my-post', ['intro', 'fazit']); + + const anchors = getAnchors('/blog/my-post'); + expect(anchors).toBeDefined(); + expect(anchors!.has('intro')).toBe(true); + expect(anchors!.has('fazit')).toBe(true); + }); + + it('should accumulate anchors for same path', () => { + registerAnchors('/blog/my-post', ['intro']); + registerAnchors('/blog/my-post', ['fazit']); + + const anchors = getAnchors('/blog/my-post'); + expect(anchors!.size).toBe(2); + }); + + it('should keep anchors separate per path', () => { + registerAnchors('/blog/post-1', ['intro']); + registerAnchors('/blog/post-2', ['fazit']); + + 
expect(getAnchors('/blog/post-1')!.has('intro')).toBe(true); + expect(getAnchors('/blog/post-1')!.has('fazit')).toBe(false); + expect(getAnchors('/blog/post-2')!.has('fazit')).toBe(true); + }); + }); + + describe('registerLinks', () => { + it('should extract anchor links from HTML', () => { + const html = '
Link'; + registerLinks('/blog/my-post', html); + + const links = getLinks(); + expect(links).toHaveLength(1); + expect(links[0]).toEqual({ + fromPath: '/blog/my-post', + toPath: '/blog/other', + anchor: 'section', + fullLink: '/blog/other#section' + }); + }); + + it('should handle same-document anchors', () => { + const html = 'Link'; + registerLinks('/blog/my-post', html); + + const links = getLinks(); + expect(links[0].toPath).toBe('/blog/my-post'); + expect(links[0].anchor).toBe('local-section'); + }); + + it('should extract multiple links', () => { + const html = ` + One + Two + Three + `; + registerLinks('/blog/my-post', html); + + expect(getLinks()).toHaveLength(3); + }); + + it('should ignore links without anchors', () => { + const html = 'No anchor'; + registerLinks('/blog/my-post', html); + + expect(getLinks()).toHaveLength(0); + }); + + it('should handle both quote styles', () => { + const html = ` + Double + Single + `; + registerLinks('/blog/my-post', html); + + expect(getLinks()).toHaveLength(2); + }); + + it('should skip external https links', () => { + const html = 'External'; + registerLinks('/blog/my-post', html); + + expect(getLinks()).toHaveLength(0); + }); + + it('should skip external http links', () => { + const html = 'External'; + registerLinks('/blog/my-post', html); + + expect(getLinks()).toHaveLength(0); + }); + + it('should skip protocol-relative links', () => { + const html = 'External'; + registerLinks('/blog/my-post', html); + + expect(getLinks()).toHaveLength(0); + }); + + it('should skip mailto links', () => { + const html = 'Mail'; + registerLinks('/blog/my-post', html); + + expect(getLinks()).toHaveLength(0); + }); + + it('should URL-decode anchor (ä encoded as %C3%A4)', () => { + // %C3%BC = ü in UTF-8 + const html = 'Link'; + registerLinks('/blog/my-post', html); + + // Anchor should be decoded to Unicode + expect(getLinks()[0].anchor).toBe('über-uns'); + }); + + it('should handle already decoded Unicode anchors', () => { + const 
html = 'Link'; + registerLinks('/blog/my-post', html); + + expect(getLinks()[0].anchor).toBe('grundsätzliches-zu-docker'); + }); + }); + + describe('validateLinks', () => { + it('should return valid for matching links', () => { + registerAnchors('/blog/post-1', ['intro', 'fazit']); + registerAnchors('/blog/post-2', ['overview']); + + const html = ` + Intro + Overview + `; + registerLinks('/blog/post-1', html); + + const result = validateLinks(); + expect(result.valid).toBe(true); + expect(result.brokenLinks).toHaveLength(0); + }); + + it('should detect broken anchor in existing path', () => { + registerAnchors('/blog/post-1', ['intro']); + + const html = 'Broken'; + registerLinks('/blog/my-post', html); + + const result = validateLinks(); + expect(result.valid).toBe(false); + expect(result.brokenLinks).toHaveLength(1); + expect(result.brokenLinks[0].anchor).toBe('nonexistent'); + }); + + it('should detect link to nonexistent path', () => { + registerAnchors('/blog/post-1', ['intro']); + + const html = 'Broken'; + registerLinks('/blog/my-post', html); + + const result = validateLinks(); + expect(result.valid).toBe(false); + expect(result.brokenLinks).toHaveLength(1); + expect(result.brokenLinks[0].toPath).toBe('/blog/nonexistent'); + }); + + it('should validate same-document links', () => { + registerAnchors('/blog/my-post', ['existing']); + + const html = ` + Valid + Broken + `; + registerLinks('/blog/my-post', html); + + const result = validateLinks(); + expect(result.valid).toBe(false); + expect(result.brokenLinks).toHaveLength(1); + expect(result.brokenLinks[0].anchor).toBe('missing'); + }); + + it('should count total links', () => { + registerAnchors('/blog/post', ['a', 'b']); + + const html = 'ABC'; + registerLinks('/blog/post', html); + + const result = validateLinks(); + expect(result.totalLinks).toBe(3); + }); + }); + + describe('resetValidator', () => { + it('should clear all data', () => { + registerAnchors('/blog/post', ['intro']); + 
registerLinks('/blog/post', 'Link'); + + resetValidator(); + + expect(getAnchors('/blog/post')).toBeUndefined(); + expect(getLinks()).toHaveLength(0); + }); + }); +}); diff --git a/shared/link-validator.ts b/shared/link-validator.ts new file mode 100644 index 0000000..c0515e8 --- /dev/null +++ b/shared/link-validator.ts @@ -0,0 +1,184 @@ +/** + * Anchor Link Validator + * + * Validates that all internal anchor links point to existing headings. + * Runs after all entries are parsed to catch broken links at build time. + * + * Features: + * - Detects broken anchor links (missing target or missing anchor) + * - Suggests similar anchors using Levenshtein distance (typo detection) + * - Non-blocking: only warns, does not fail the build + * + * Usage: + * 1. registerAnchors(path, headingIds) - call after parsing each entry + * 2. registerLinks(path, html) - extracts and registers all anchor links + * 3. validateLinks() - call at end of build to check for broken links + */ + +import { findSimilar } from './string.utils'; + +/** Registry of all anchors per entry path */ +const anchorRegistry = new Map>(); + +/** Registry of all anchor links found: { fromPath, toPath, anchor } */ +interface AnchorLink { + fromPath: string; + toPath: string; + anchor: string; + fullLink: string; +} +const linkRegistry: AnchorLink[] = []; + +// Regex to find href attributes with anchors: href="/blog/slug#anchor" or href="#anchor" +const ANCHOR_LINK_REGEX = /]*\shref=(["'])([^"']*#[^"']+)\1/g; + +/** + * Extract all anchor links from HTML using matchAll(). + * Safer than exec() loop with global regex - no shared state issues. + */ +function extractAnchorLinks(html: string): Array<{ fullLink: string }> { + return [...html.matchAll(ANCHOR_LINK_REGEX)].map(match => ({ + fullLink: match[2] + })); +} + +/** + * Register heading anchors for an entry. 
+ * @param entryPath - Absolute path like "/blog/my-post" + * @param headingIds - Array of heading IDs like ["intro", "fazit"] + */ +export function registerAnchors(entryPath: string, headingIds: string[]): void { + const existing = anchorRegistry.get(entryPath) ?? new Set(); + for (const id of headingIds) { + existing.add(id); + } + anchorRegistry.set(entryPath, existing); +} + +/** + * Check if a URL is external (should not be validated). + */ +function isExternalUrl(url: string): boolean { + return /^https?:\/\//.test(url) || + url.startsWith('//') || + url.startsWith('mailto:') || + url.startsWith('tel:'); +} + +/** + * Extract anchor links from HTML and register them. + * Only registers internal links - external URLs are skipped. + * @param fromPath - Entry path where links were found + * @param html - HTML content to scan for links + */ +export function registerLinks(fromPath: string, html: string): void { + for (const { fullLink } of extractAnchorLinks(html)) { + // Skip external URLs + if (isExternalUrl(fullLink)) continue; + + // Parse the link: "/blog/other#section" or "#section" + const hashIndex = fullLink.indexOf('#'); + if (hashIndex === -1) continue; + + const pathPart = fullLink.substring(0, hashIndex); + // URL-decode anchor (marked encodes special chars like ä → %C3%A4) + const anchor = decodeURIComponent(fullLink.substring(hashIndex + 1)); + + // Determine target path + const toPath = pathPart || fromPath; // Empty path = same document + + linkRegistry.push({ + fromPath, + toPath, + anchor, + fullLink + }); + } +} + +/** + * Validate all registered links against registered anchors. 
+ * @returns Object with broken links and stats + */ +export function validateLinks(): { + valid: boolean; + totalLinks: number; + brokenLinks: AnchorLink[]; +} { + const brokenLinks: AnchorLink[] = []; + + for (const link of linkRegistry) { + const targetAnchors = anchorRegistry.get(link.toPath); + + if (!targetAnchors) { + // Target entry doesn't exist + brokenLinks.push(link); + } else if (!targetAnchors.has(link.anchor)) { + // Anchor doesn't exist in target entry + brokenLinks.push(link); + } + } + + return { + valid: brokenLinks.length === 0, + totalLinks: linkRegistry.length, + brokenLinks + }; +} + +/** + * Print validation results to console. + * @returns true if all links are valid, false if there are broken links + */ +export function printValidationResults(): boolean { + const { valid, totalLinks, brokenLinks } = validateLinks(); + + if (valid) { + console.log(`✓ All ${totalLinks} anchor links are valid`); + return true; + } + + console.warn(`\n⚠️ Found ${brokenLinks.length} broken anchor link(s):\n`); + for (const link of brokenLinks) { + console.warn(` ${link.fromPath}`); + console.warn(` → ${link.fullLink}`); + + // Provide helpful context + const targetAnchors = anchorRegistry.get(link.toPath); + if (!targetAnchors) { + console.warn(` ✗ Target path "${link.toPath}" does not exist`); + } else { + console.warn(` ✗ Anchor "#${link.anchor}" not found`); + // Suggest similar anchors using fuzzy matching (Levenshtein distance ≤ 3) + const similar = findSimilar(link.anchor, [...targetAnchors], 3); + if (similar.length > 0) { + console.warn(` ? Did you mean: ${similar.slice(0, 3).map(a => '#' + a).join(', ')}`); + } + } + console.warn(''); + } + + return false; +} + +/** + * Reset the validator (for testing). + */ +export function resetValidator(): void { + anchorRegistry.clear(); + linkRegistry.length = 0; +} + +/** + * Get registered anchors for a path (for testing). 
+ */ +export function getAnchors(entryPath: string): Set | undefined { + return anchorRegistry.get(entryPath); +} + +/** + * Get all registered links (for testing). + */ +export function getLinks(): AnchorLink[] { + return [...linkRegistry]; +} diff --git a/shared/list.utils.ts b/shared/list.utils.ts index b1bac4c..f42e1e6 100644 --- a/shared/list.utils.ts +++ b/shared/list.utils.ts @@ -1,4 +1,5 @@ import { EntryBase } from './base.types'; +import { stripHtmlTags } from './html.utils'; /** * Extract the first "big" paragraph from HTML content. @@ -17,7 +18,6 @@ export function extractFirstBigParagraph(html: string): string { return ''; } - const stripHtmlTags = (s: string) => s.replace(/<[^>]*>/g, ''); const bigParagraph = matches.find(m => m && stripHtmlTags(m).length > 100); const paragraph = bigParagraph || matches[0] || ''; return paragraph.replace(/(.*?)<\/a>/g, '$1'); diff --git a/shared/string.utils.spec.ts b/shared/string.utils.spec.ts new file mode 100644 index 0000000..c95a451 --- /dev/null +++ b/shared/string.utils.spec.ts @@ -0,0 +1,193 @@ +import { describe, it, expect } from 'vitest'; +import { levenshtein, findSimilar } from './string.utils'; + +describe('levenshtein', () => { + describe('identical strings', () => { + it('should return 0 for empty strings', () => { + expect(levenshtein('', '')).toBe(0); + }); + + it('should return 0 for identical strings', () => { + expect(levenshtein('hello', 'hello')).toBe(0); + expect(levenshtein('introduction', 'introduction')).toBe(0); + }); + }); + + describe('empty string cases', () => { + it('should return length of other string when one is empty', () => { + expect(levenshtein('', 'abc')).toBe(3); + expect(levenshtein('hello', '')).toBe(5); + }); + }); + + describe('single character edits', () => { + it('should detect single insertion', () => { + expect(levenshtein('ac', 'abc')).toBe(1); + expect(levenshtein('hell', 'hello')).toBe(1); + }); + + it('should detect single deletion', () => { + 
expect(levenshtein('abc', 'ac')).toBe(1); + expect(levenshtein('hello', 'helo')).toBe(1); + }); + + it('should detect single substitution', () => { + expect(levenshtein('abc', 'adc')).toBe(1); + expect(levenshtein('cat', 'bat')).toBe(1); + }); + }); + + describe('multiple edits', () => { + it('should count transposition as 2 edits', () => { + // Levenshtein counts transposition as 2 edits (delete + insert) + expect(levenshtein('ab', 'ba')).toBe(2); + expect(levenshtein('intro', 'intor')).toBe(2); + }); + + it('should handle classic example: kitten → sitting', () => { + // kitten → sitten (substitute k→s) + // sitten → sittin (substitute e→i) + // sittin → sitting (insert g) + expect(levenshtein('kitten', 'sitting')).toBe(3); + }); + + it('should handle complete replacement', () => { + expect(levenshtein('abc', 'xyz')).toBe(3); + }); + }); + + describe('real-world anchor examples', () => { + it('should detect typo: fazti → fazit', () => { + expect(levenshtein('fazti', 'fazit')).toBe(2); + }); + + it('should detect missing letter: instalation → installation', () => { + expect(levenshtein('instalation', 'installation')).toBe(1); + }); + + it('should detect extra letter: intrroduction → introduction', () => { + expect(levenshtein('intrroduction', 'introduction')).toBe(1); + }); + + it('should detect wrong letter: getting-startet → getting-started', () => { + expect(levenshtein('getting-startet', 'getting-started')).toBe(1); + }); + + it('should handle German umlauts', () => { + expect(levenshtein('über-uns', 'uber-uns')).toBe(1); + expect(levenshtein('übersicht', 'übersicht')).toBe(0); + }); + }); + + describe('symmetry', () => { + it('should be symmetric: d(a,b) = d(b,a)', () => { + expect(levenshtein('abc', 'def')).toBe(levenshtein('def', 'abc')); + expect(levenshtein('hello', 'hallo')).toBe(levenshtein('hallo', 'hello')); + expect(levenshtein('short', 'muchlonger')).toBe(levenshtein('muchlonger', 'short')); + }); + }); + + describe('triangle inequality', () => { + 
it('should satisfy: d(a,c) ≤ d(a,b) + d(b,c)', () => { + const a = 'abc'; + const b = 'abd'; + const c = 'acd'; + const dAB = levenshtein(a, b); + const dBC = levenshtein(b, c); + const dAC = levenshtein(a, c); + expect(dAC).toBeLessThanOrEqual(dAB + dBC); + }); + }); +}); + +describe('findSimilar', () => { + const candidates = [ + 'introduction', + 'getting-started', + 'installation', + 'configuration', + 'conclusion', + 'fazit', + 'über-uns' + ]; + + describe('typo detection', () => { + it('should find similar for typo: intrduction → introduction', () => { + const result = findSimilar('intrduction', candidates); + expect(result).toContain('introduction'); + }); + + it('should find similar for typo: instalation → installation', () => { + const result = findSimilar('instalation', candidates); + expect(result).toContain('installation'); + }); + + it('should find similar for typo: fazti → fazit', () => { + const result = findSimilar('fazti', candidates); + expect(result).toContain('fazit'); + }); + }); + + describe('sorting by distance', () => { + it('should return results sorted by distance (most similar first)', () => { + // 'intro' has distance 7 to 'introduction' and higher to others + const testCandidates = ['abc', 'ab', 'abcd', 'abcde']; + const result = findSimilar('abc', testCandidates, 5); + + // ab=1, abcd=1, abcde=2 (abc is exact match, excluded) + expect(result[0]).toBe('ab'); + // ab and abcd both have distance 1, order may vary + expect(result).toContain('abcd'); + }); + }); + + describe('maxDistance threshold', () => { + it('should respect maxDistance parameter', () => { + const result = findSimilar('xyz', candidates, 2); + // All candidates are far from 'xyz', none within distance 2 + expect(result).toHaveLength(0); + }); + + it('should include matches at exactly maxDistance', () => { + // 'fazit' → 'fazti' has distance 2 + const result = findSimilar('fazti', ['fazit'], 2); + expect(result).toContain('fazit'); + }); + + it('should exclude matches 
beyond maxDistance', () => { + const result = findSimilar('fazti', ['fazit'], 1); + expect(result).not.toContain('fazit'); + }); + }); + + describe('exact matches', () => { + it('should not include exact matches (not useful as suggestions)', () => { + const result = findSimilar('fazit', candidates); + expect(result).not.toContain('fazit'); + }); + }); + + describe('empty cases', () => { + it('should return empty array for empty candidates', () => { + const result = findSimilar('test', []); + expect(result).toHaveLength(0); + }); + + it('should return empty array when nothing is similar', () => { + const result = findSimilar('xyzabc123', candidates, 3); + expect(result).toHaveLength(0); + }); + }); + + describe('default maxDistance', () => { + it('should use default maxDistance of 3', () => { + // 'intro' to 'fazit' is distance 5, should not match with default + const result = findSimilar('intro', ['fazit']); + expect(result).toHaveLength(0); + + // 'fazi' to 'fazit' is distance 1, should match + const result2 = findSimilar('fazi', ['fazit']); + expect(result2).toContain('fazit'); + }); + }); +}); diff --git a/shared/string.utils.ts b/shared/string.utils.ts new file mode 100644 index 0000000..814da1f --- /dev/null +++ b/shared/string.utils.ts @@ -0,0 +1,113 @@ +/** + * String utility functions. + */ + +/** + * Calculate the Levenshtein distance between two strings. + * + * The Levenshtein distance is the minimum number of single-character edits + * (insertions, deletions, or substitutions) required to transform one string + * into another. 
+ * + * @example + * ```typescript + * levenshtein('kitten', 'sitting'); // 3 (k→s, e→i, +g) + * levenshtein('intro', 'intor'); // 2 (transposition = 2 edits) + * levenshtein('hello', 'hello'); // 0 (identical) + * levenshtein('', 'abc'); // 3 (3 insertions) + * ``` + * + * Time complexity: O(m × n) where m = a.length, n = b.length + * Space complexity: O(min(m, n)) using single-row optimization + * + * @param a - First string + * @param b - Second string + * @returns The edit distance (0 = identical, higher = more different) + */ +export function levenshtein(a: string, b: string): number { + // Ensure a is the shorter string for space optimization + if (a.length > b.length) { + [a, b] = [b, a]; + } + + const m = a.length; + const n = b.length; + + // Edge cases + if (m === 0) return n; + if (n === 0) return m; + + // Single-row DP: dp[i] = distance for a[0..i-1] vs b[0..j-1] + // Initialize with distances for empty b (i deletions from a) + const dp: number[] = Array.from({ length: m + 1 }, (_, i) => i); + + for (let j = 1; j <= n; j++) { + let prev = dp[0]; // dp[i-1][j-1] from previous iteration + dp[0] = j; // Distance for empty a vs b[0..j-1] + + for (let i = 1; i <= m; i++) { + const temp = dp[i]; + + if (a[i - 1] === b[j - 1]) { + // Characters match: no edit needed + dp[i] = prev; + } else { + // Minimum of: substitute, insert, delete + dp[i] = 1 + Math.min( + prev, // substitute a[i-1] with b[j-1] + dp[i], // insert b[j-1] (old dp[i] = a[0..i-1] vs b[0..j-2]) + dp[i - 1] // delete a[i-1] (new dp[i-1] = a[0..i-2] vs b[0..j-1]) + ); + } + + prev = temp; + } + } + + return dp[m]; +} + +/** + * Find strings similar to a query using Levenshtein distance. + * + * Returns candidates sorted by similarity (most similar first). + * Only includes candidates within the maximum distance threshold.
+ * + * @example + * ```typescript + * const headings = ['introduction', 'getting-started', 'conclusion']; + * findSimilar('intrduction', headings, 3); + * // Returns: ['introduction'] (distance 1) + * + * findSimilar('start', headings, 10); + * // Returns: ['introduction', 'getting-started', 'conclusion'] (each at distance 10; + * // being a substring of 'getting-started' gives "start" no advantage) + * ``` + * + * @param query - The string to find matches for + * @param candidates - Array of strings to search in + * @param maxDistance - Maximum edit distance to consider (default: 3) + * @returns Array of similar strings, sorted by distance (ascending) + */ +export function findSimilar( + query: string, + candidates: string[], + maxDistance: number = 3 +): string[] { + const matches: Array<{ candidate: string; distance: number }> = []; + + for (const candidate of candidates) { + // Skip exact matches (not useful as suggestions) + if (candidate === query) continue; + + const distance = levenshtein(query, candidate); + + if (distance <= maxDistance) { + matches.push({ candidate, distance }); + } + } + + // Sort by distance (most similar first) + matches.sort((a, b) => a.distance - b.distance); + + return matches.map(m => m.candidate); +} diff --git a/shared/types.d.ts b/shared/types.d.ts deleted file mode 100644 index 057d8dd..0000000 --- a/shared/types.d.ts +++ /dev/null @@ -1,4 +0,0 @@ -declare module 'marked-gfm-heading-id' { - import type { MarkedExtension } from 'marked'; - export function gfmHeadingId(): MarkedExtension; -}