From 79eb628d28c3f6f3fe4e3e179830ce3aa4a8d7c0 Mon Sep 17 00:00:00 2001 From: Jorben Date: Mon, 26 Jan 2026 13:12:26 +0800 Subject: [PATCH 1/3] docs: add Office splitter design for Word, PowerPoint and Excel support --- docs/OFFICE_SPLITTER_DESIGN.md | 1147 ++++++++++++++++++++++++++++++++ 1 file changed, 1147 insertions(+) create mode 100644 docs/OFFICE_SPLITTER_DESIGN.md diff --git a/docs/OFFICE_SPLITTER_DESIGN.md b/docs/OFFICE_SPLITTER_DESIGN.md new file mode 100644 index 0000000..58fb1b6 --- /dev/null +++ b/docs/OFFICE_SPLITTER_DESIGN.md @@ -0,0 +1,1147 @@ +# Office 文件支持扩展设计方案 + +## 概述 + +本文档描述如何扩展 MarkPDFdown Desktop 以支持 Word、PowerPoint 和 Excel 文件的转换。 + +### 目标 +- 支持 `word` 类型(.doc, .docx, .dot, .dotx) +- 支持 `powerpoint` 类型(.ppt, .pptx, .pot, .potx) +- 支持 `excel` 类型(.xls, .xlsx, .xlt, .xltx, .csv) +- 轻量级实现(包大小增加 < 4MB) +- 复用 Electron 渲染能力,无需额外浏览器进程 + +### 设计原则 +- 统一使用 `docType`(文档类型)进行分支判断,而非文件扩展名 +- 遵循现有清洁架构(ISplitter 接口 → SplitterFactory) +- 与现有 PDF/Image 分割器保持一致的 API + +--- + +## 架构设计 + +### 1. 文档类型定义 + +```typescript +// src/shared/types/DocType.ts +export enum DocType { + PDF = 'pdf', + IMAGE = 'image', + WORD = 'word', + POWERPOINT = 'powerpoint', + EXCEL = 'excel', +} + +// 扩展名到文档类型的映射 +export const EXTENSION_TO_DOCTYPE: Record = { + // PDF + 'pdf': DocType.PDF, + + // Image + 'jpg': DocType.IMAGE, + 'jpeg': DocType.IMAGE, + 'png': DocType.IMAGE, + 'webp': DocType.IMAGE, + + // Word + 'doc': DocType.WORD, + 'docx': DocType.WORD, + 'dot': DocType.WORD, + 'dotx': DocType.WORD, + + // PowerPoint + 'ppt': DocType.POWERPOINT, + 'pptx': DocType.POWERPOINT, + 'pot': DocType.POWERPOINT, + 'potx': DocType.POWERPOINT, + + // Excel + 'xls': DocType.EXCEL, + 'xlsx': DocType.EXCEL, + 'xlt': DocType.EXCEL, + 'xltx': DocType.EXCEL, + 'csv': DocType.EXCEL, +}; +``` + +### 2. 
类图 + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ Domain Layer │ +├─────────────────────────────────────────────────────────────────────────┤ +│ ┌───────────────────┐ ┌────────────────┐ │ +│ │ <> │ │ DocType │ │ +│ │ ISplitter │ │ (enum) │ │ +│ ├───────────────────┤ ├────────────────┤ │ +│ │ + split(task) │ │ PDF │ │ +│ │ + cleanup(taskId) │ │ IMAGE │ │ +│ └───────────────────┘ │ WORD │ │ +│ ▲ │ POWERPOINT │ │ +│ │ │ EXCEL │ │ +│ │ └────────────────┘ │ +├───────────┼─────────────────────────────────────────────────────────────┤ +│ │ Infrastructure Layer │ +├───────────┼─────────────────────────────────────────────────────────────┤ +│ ┌────────┴─────────┬──────────────────┬──────────────────┐ │ +│ │ │ │ │ │ +│ ▼ ▼ ▼ ▼ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────────────────┐ │ +│ │ PDFSplitter │ │ImageSplitter │ │ OfficeSplitter (NEW) │ │ +│ ├──────────────┤ ├──────────────┤ ├──────────────────────────────────┤ │ +│ │ pdf-to-png │ │ fs.copyFile │ │ - mammoth.js (Word → HTML) │ │ +│ │ pdf-lib │ │ │ │ - jszip (PPT → HTML) │ │ +│ └──────────────┘ └──────────────┘ │ - xlsx (Excel → HTML) │ │ +│ │ - BrowserWindow (HTML → PNG) │ │ +│ └──────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────┐ │ +│ │ SplitterFactory (修改) │ │ +│ ├─────────────────────────────────────────────────────────────────┤ │ +│ │ + create(docType: DocType): ISplitter │ │ +│ │ + getDocType(filename: string): DocType │ │ +│ │ + createFromFilename(filename: string): ISplitter │ │ +│ └─────────────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +### 3. 
处理流程 + +``` +┌──────────────────┐ ┌────────────────────┐ ┌─────────────────────┐ +│ 上传文件 │───▶│ SplitterFactory │───▶│ OfficeSplitter │ +│ .docx/.pptx/.xlsx│ │ getDocType() │ │ │ +└──────────────────┘ │ create(docType) │ └──────────┬──────────┘ + └────────────────────┘ │ + ▼ + ┌────────────────────────────────────────────────────┐ + │ split(task) │ + ├────────────────────────────────────────────────────┤ + │ 1. 根据 docType 选择解析策略 │ + │ - word: mammoth.js 转 HTML │ + │ - powerpoint: jszip 解析幻灯片 XML │ + │ - excel: xlsx 库解析工作表 │ + │ │ + │ 2. 渲染 HTML 为图片 │ + │ - 创建隐藏 BrowserWindow │ + │ - loadURL (data:text/html) │ + │ - capturePage() 截图 │ + │ │ + │ 3. 分页策略 │ + │ - word: 按内容高度分页(A4 比例) │ + │ - powerpoint: 每张幻灯片一页 │ + │ - excel: 按 Sheet 分页 + 智能尺寸计算 │ + └────────────────────────────────────────────────────┘ + │ + ▼ + ┌────────────────────────────────────────────────────┐ + │ SplitResult │ + │ { pages: PageInfo[], totalPages: number } │ + └────────────────────────────────────────────────────┘ +``` + +--- + +## 详细设计 + +### 1. 
SplitterFactory 改造 + +```typescript +// src/core/infrastructure/adapters/split/SplitterFactory.ts +import path from 'path'; +import { ISplitter } from '../../../domain/split/ISplitter.js'; +import { PDFSplitter } from './PDFSplitter.js'; +import { ImageSplitter } from './ImageSplitter.js'; +import { OfficeSplitter } from './OfficeSplitter.js'; +import { DocType, EXTENSION_TO_DOCTYPE } from '../../../../shared/types/DocType.js'; + +export class SplitterFactory { + private readonly uploadsDir: string; + + constructor(uploadsDir: string) { + this.uploadsDir = uploadsDir; + } + + /** + * 根据文档类型创建对应的分割器 + * + * @param docType - 文档类型(统一使用 DocType 枚举) + * @returns 对应的分割器实例 + */ + create(docType: DocType): ISplitter { + switch (docType) { + case DocType.PDF: + return new PDFSplitter(this.uploadsDir); + + case DocType.IMAGE: + return new ImageSplitter(this.uploadsDir); + + case DocType.WORD: + case DocType.POWERPOINT: + case DocType.EXCEL: + return new OfficeSplitter(this.uploadsDir, docType); + + default: + const supportedTypes = Object.values(DocType).join(', '); + throw new Error( + `Unsupported document type: ${docType}. Supported types: ${supportedTypes}` + ); + } + } + + /** + * 从文件名获取文档类型 + * + * @param filename - 文件名 + * @returns 文档类型 + */ + static getDocType(filename: string): DocType { + const ext = path.extname(filename); + if (!ext || ext === '.') { + throw new Error(`Filename has no extension: ${filename}`); + } + + const normalizedExt = ext.slice(1).toLowerCase(); + const docType = EXTENSION_TO_DOCTYPE[normalizedExt]; + + if (!docType) { + const supportedExts = Object.keys(EXTENSION_TO_DOCTYPE).join(', '); + throw new Error( + `Unsupported file extension: ${ext}. Supported extensions: ${supportedExts}` + ); + } + + return docType; + } + + /** + * 从文件名创建分割器(便捷方法) + */ + createFromFilename(filename: string): ISplitter { + const docType = SplitterFactory.getDocType(filename); + return this.create(docType); + } +} +``` + +### 2. 
OfficeSplitter 实现 + +```typescript +// src/core/infrastructure/adapters/split/OfficeSplitter.ts +import { promises as fs } from 'fs'; +import path from 'path'; +import mammoth from 'mammoth'; +import { BrowserWindow } from 'electron'; +import { ISplitter, SplitResult, PageInfo } from '../../../domain/split/ISplitter.js'; +import { Task } from '../../../../shared/types/index.js'; +import { DocType } from '../../../../shared/types/DocType.js'; +import { ImagePathUtil } from './ImagePathUtil.js'; +import { WORKER_CONFIG } from '../../config/worker.config.js'; + +/** + * 页面配置常量 + */ +const PAGE_CONFIG = { + /** A4 页面宽度(像素,96 DPI) */ + PAGE_WIDTH: 794, + /** A4 页面高度(像素,96 DPI) */ + PAGE_HEIGHT: 1123, + /** PPT 幻灯片宽度 */ + SLIDE_WIDTH: 1280, + /** PPT 幻灯片高度(16:9) */ + SLIDE_HEIGHT: 720, + /** 渲染缩放因子 */ + DEVICE_SCALE_FACTOR: 2, +}; + +/** + * Excel 页面配置常量 + */ +const EXCEL_CONFIG = { + /** 最大渲染宽度 */ + MAX_WIDTH: 1600, + /** 最大渲染高度(单次截图) */ + MAX_HEIGHT: 2000, + /** 每列默认宽度 */ + DEFAULT_COL_WIDTH: 100, + /** 最小列宽 */ + MIN_COL_WIDTH: 60, + /** 行高 */ + ROW_HEIGHT: 28, + /** 最小页面宽度 */ + MIN_WIDTH: 800, +}; + +/** + * Office 文件分割器 + * + * 支持: + * - Word 文档:.doc, .docx, .dot, .dotx + * - PowerPoint 演示文稿:.ppt, .pptx, .pot, .potx + * - Excel 电子表格:.xls, .xlsx, .xlt, .xltx, .csv + * + * 技术方案: + * - 使用 mammoth.js 将 Word 文档转换为 HTML + * - 使用 jszip 解析 PowerPoint 文件 + * - 使用 xlsx (SheetJS) 解析 Excel 文件 + * - 复用 Electron BrowserWindow 进行 HTML → PNG 渲染 + */ +export class OfficeSplitter implements ISplitter { + private readonly uploadsDir: string; + private readonly docType: DocType; + + constructor(uploadsDir: string, docType: DocType) { + this.uploadsDir = uploadsDir; + this.docType = docType; + } + + /** + * 分割 Office 文件为页面图片 + */ + async split(task: Task): Promise { + if (!task.id) { + throw new Error('Task ID is required'); + } + if (!task.filename) { + throw new Error('Task filename is required'); + } + + const taskId = task.id; + const filename = task.filename; + const sourcePath = 
path.join(this.uploadsDir, taskId, filename); + + try { + // 确保源文件存在 + await fs.access(sourcePath); + + // 确保输出目录存在 + const taskDir = ImagePathUtil.getTaskDir(taskId); + await fs.mkdir(taskDir, { recursive: true }); + + // 根据文档类型选择处理策略 + let pages: PageInfo[]; + + switch (this.docType) { + case DocType.WORD: + pages = await this.splitWord(sourcePath, taskId); + break; + case DocType.POWERPOINT: + pages = await this.splitPowerPoint(sourcePath, taskId); + break; + case DocType.EXCEL: + pages = await this.splitExcel(sourcePath, taskId); + break; + default: + throw new Error(`OfficeSplitter does not support docType: ${this.docType}`); + } + + return { + pages, + totalPages: pages.length, + }; + } catch (error) { + throw this.wrapError(error, taskId, filename); + } + } + + /** + * 分割 Word 文档 + */ + private async splitWord(sourcePath: string, taskId: string): Promise { + // 使用 mammoth.js 将 docx 转换为 HTML + const result = await mammoth.convertToHtml({ path: sourcePath }); + const html = result.value; + + // 如果有警告,记录日志 + if (result.messages.length > 0) { + console.warn(`[OfficeSplitter] Word conversion warnings:`, result.messages); + } + + // 构建完整 HTML 页面 + const fullHtml = this.buildWordHtml(html); + + // 渲染为图片(按页分割) + return this.renderHtmlToPages(fullHtml, taskId, DocType.WORD); + } + + /** + * 分割 PowerPoint 演示文稿 + */ + private async splitPowerPoint(sourcePath: string, taskId: string): Promise { + // 解析 PPTX 文件 + const slides = await this.parsePptx(sourcePath); + + const pages: PageInfo[] = []; + + // 每张幻灯片单独渲染 + for (let i = 0; i < slides.length; i++) { + const slideHtml = this.buildSlideHtml(slides[i], i + 1); + const pageImages = await this.renderHtmlToPages(slideHtml, taskId, DocType.POWERPOINT, i + 1); + pages.push(...pageImages); + } + + return pages; + } + + /** + * 分割 Excel 电子表格 + * + * 策略:按 Sheet 分页 + 智能尺寸计算 + * - 每个 Sheet 独立渲染 + * - 根据列数动态计算宽度 + * - 超长内容自动垂直分页 + */ + private async splitExcel(sourcePath: string, taskId: string): Promise { + const XLSX = await 
import('xlsx'); + + // 读取 Excel 文件 + const workbook = XLSX.read(await fs.readFile(sourcePath), { type: 'buffer' }); + + if (workbook.SheetNames.length === 0) { + throw new Error('Excel file contains no sheets'); + } + + const pages: PageInfo[] = []; + let pageIndex = 0; + + // 遍历每个 Sheet + for (const sheetName of workbook.SheetNames) { + const worksheet = workbook.Sheets[sheetName]; + + // 获取数据范围 + const range = XLSX.utils.decode_range(worksheet['!ref'] || 'A1'); + const colCount = range.e.c - range.s.c + 1; + const rowCount = range.e.r - range.s.r + 1; + + // 计算渲染尺寸 + const dimensions = this.calculateExcelDimensions(colCount, rowCount); + + // 转换为 HTML 表格 + const htmlTable = XLSX.utils.sheet_to_html(worksheet, { + editable: false, + header: '', + footer: '', + }); + + // 构建完整 HTML + const fullHtml = this.buildExcelHtml(htmlTable, sheetName, dimensions.width); + + // 渲染为图片(可能需要垂直分页) + const sheetPages = await this.renderExcelToPages( + fullHtml, + taskId, + pageIndex, + dimensions + ); + + pages.push(...sheetPages); + pageIndex += sheetPages.length; + } + + return pages; + } + + /** + * 计算 Excel Sheet 的渲染尺寸 + */ + private calculateExcelDimensions( + colCount: number, + rowCount: number + ): { width: number; height: number; pages: number } { + // 计算内容宽度 + const contentWidth = Math.min( + colCount * EXCEL_CONFIG.DEFAULT_COL_WIDTH, + EXCEL_CONFIG.MAX_WIDTH + ); + + // 计算内容高度 + const contentHeight = rowCount * EXCEL_CONFIG.ROW_HEIGHT; + + // 计算需要多少页 + const pages = Math.ceil(contentHeight / EXCEL_CONFIG.MAX_HEIGHT); + + return { + width: Math.max(contentWidth, EXCEL_CONFIG.MIN_WIDTH), + height: Math.min(contentHeight, EXCEL_CONFIG.MAX_HEIGHT), + pages, + }; + } + + /** + * 构建 Excel 表格的完整 HTML + */ + private buildExcelHtml(tableHtml: string, sheetName: string, width: number): string { + return ` + + + + + + + +
📊 ${this.escapeHtml(sheetName)}
+ ${tableHtml} + +`; + } + + /** + * 渲染 Excel HTML 为页面图片 + * + * 处理超长表格的垂直分页 + */ + private async renderExcelToPages( + html: string, + taskId: string, + startPageIndex: number, + dimensions: { width: number; height: number; pages: number } + ): Promise { + const pages: PageInfo[] = []; + + // 创建隐藏的渲染窗口 + const renderWindow = new BrowserWindow({ + show: false, + width: dimensions.width, + height: dimensions.height, + webPreferences: { + nodeIntegration: false, + contextIsolation: true, + offscreen: true, + }, + }); + + try { + renderWindow.webContents.setZoomFactor(PAGE_CONFIG.DEVICE_SCALE_FACTOR); + + const dataUrl = `data:text/html;charset=utf-8,${encodeURIComponent(html)}`; + await renderWindow.loadURL(dataUrl); + await this.waitForRender(renderWindow); + + // 获取实际内容高度 + const totalHeight = await renderWindow.webContents.executeJavaScript( + 'document.body.scrollHeight' + ); + + const pageHeight = EXCEL_CONFIG.MAX_HEIGHT; + const totalPages = Math.ceil(totalHeight / pageHeight); + + for (let i = 0; i < totalPages; i++) { + const pageNum = startPageIndex + i + 1; + const imagePath = ImagePathUtil.getPath(taskId, pageNum); + + // 滚动到对应位置 + await renderWindow.webContents.executeJavaScript( + `window.scrollTo(0, ${i * pageHeight})` + ); + + await this.sleep(100); + + // 截图 + const captureHeight = Math.min(pageHeight, totalHeight - i * pageHeight); + const image = await renderWindow.webContents.capturePage({ + x: 0, + y: 0, + width: dimensions.width, + height: captureHeight, + }); + + await fs.writeFile(imagePath, image.toPNG()); + + pages.push({ + page: pageNum, + pageSource: pageNum, + imagePath, + }); + } + + return pages; + } finally { + renderWindow.destroy(); + } + } + + /** + * 解析 PPTX 文件 + * + * PPTX 是 ZIP 格式,包含 XML 文件 + */ + private async parsePptx(sourcePath: string): Promise { + const JSZip = (await import('jszip')).default; + + const data = await fs.readFile(sourcePath); + const zip = await JSZip.loadAsync(data); + + const slides: string[] = []; + + // 
PPTX 结构: ppt/slides/slide1.xml, slide2.xml, ... + const slideFiles = Object.keys(zip.files) + .filter(name => /^ppt\/slides\/slide\d+\.xml$/.test(name)) + .sort((a, b) => { + const numA = parseInt(a.match(/slide(\d+)/)?.[1] || '0'); + const numB = parseInt(b.match(/slide(\d+)/)?.[1] || '0'); + return numA - numB; + }); + + for (const slideFile of slideFiles) { + const content = await zip.file(slideFile)?.async('string'); + if (content) { + // 从 XML 提取文本内容 + const slideHtml = this.pptxXmlToHtml(content); + slides.push(slideHtml); + } + } + + if (slides.length === 0) { + throw new Error('PowerPoint file contains no slides'); + } + + return slides; + } + + /** + * 将 PPTX XML 转换为 HTML + */ + private pptxXmlToHtml(xml: string): string { + // 提取文本内容( 标签) + const textRegex = /]*>([^<]*)<\/a:t>/g; + const texts: string[] = []; + let match; + + while ((match = textRegex.exec(xml)) !== null) { + if (match[1].trim()) { + texts.push(match[1]); + } + } + + // 构建简单 HTML(保持段落结构) + return texts.map(text => `

<p>${this.escapeHtml(text)}</p>

`).join('\n'); + } + + /** + * 构建 Word 文档的完整 HTML + */ + private buildWordHtml(content: string): string { + return ` + + + + + + + + ${content} + +`; + } + + /** + * 构建单张幻灯片的 HTML + */ + private buildSlideHtml(content: string, slideNumber: number): string { + return ` + + + + + + + +
+ ${content} +
+
${slideNumber}
+ +`; + } + + /** + * 将 HTML 渲染为页面图片 + * + * 利用 Electron 的 BrowserWindow 进行渲染: + * 1. 创建隐藏的 BrowserWindow + * 2. 加载 HTML 内容 + * 3. 使用 capturePage() 截图 + * 4. 保存为 PNG 文件 + */ + private async renderHtmlToPages( + html: string, + taskId: string, + docType: DocType, + slideNumber?: number + ): Promise { + const pages: PageInfo[] = []; + + // 根据文档类型确定页面尺寸 + const isSlide = docType === DocType.POWERPOINT; + const pageWidth = isSlide ? PAGE_CONFIG.SLIDE_WIDTH : PAGE_CONFIG.PAGE_WIDTH; + const pageHeight = isSlide ? PAGE_CONFIG.SLIDE_HEIGHT : PAGE_CONFIG.PAGE_HEIGHT; + + // 创建隐藏的渲染窗口 + const renderWindow = new BrowserWindow({ + show: false, + width: pageWidth, + height: pageHeight, + webPreferences: { + nodeIntegration: false, + contextIsolation: true, + offscreen: true, + }, + }); + + try { + // 设置设备缩放因子以获得高清图片 + renderWindow.webContents.setZoomFactor(PAGE_CONFIG.DEVICE_SCALE_FACTOR); + + // 加载 HTML 内容 + const dataUrl = `data:text/html;charset=utf-8,${encodeURIComponent(html)}`; + await renderWindow.loadURL(dataUrl); + + // 等待页面渲染完成 + await this.waitForRender(renderWindow); + + if (isSlide) { + // PPT:每张幻灯片单独截图 + const pageNum = slideNumber || 1; + const imagePath = ImagePathUtil.getPath(taskId, pageNum); + + const image = await renderWindow.webContents.capturePage(); + await fs.writeFile(imagePath, image.toPNG()); + + pages.push({ + page: pageNum, + pageSource: pageNum, + imagePath, + }); + } else { + // Word:获取文档总高度,按页分割 + const totalHeight = await renderWindow.webContents.executeJavaScript( + 'document.body.scrollHeight' + ); + + const totalPages = Math.ceil(totalHeight / pageHeight); + + for (let i = 0; i < totalPages; i++) { + const pageNum = i + 1; + const imagePath = ImagePathUtil.getPath(taskId, pageNum); + + // 滚动到对应页面位置 + await renderWindow.webContents.executeJavaScript( + `window.scrollTo(0, ${i * pageHeight})` + ); + + // 等待滚动完成 + await this.sleep(100); + + // 截图 + const image = await renderWindow.webContents.capturePage({ + x: 0, + y: 0, + width: pageWidth, + 
height: pageHeight, + }); + + await fs.writeFile(imagePath, image.toPNG()); + + pages.push({ + page: pageNum, + pageSource: pageNum, + imagePath, + }); + } + } + + return pages; + } finally { + // 确保窗口被关闭 + renderWindow.destroy(); + } + } + + /** + * 等待页面渲染完成 + */ + private async waitForRender(window: BrowserWindow): Promise<void> { + return new Promise((resolve) => { + window.webContents.on('did-finish-load', () => { + // 额外等待一小段时间确保样式渲染完成 + setTimeout(resolve, 200); + }); + }); + } + + /** + * HTML 转义 + */ + private escapeHtml(text: string): string { + return text + .replace(/&/g, '&amp;') + .replace(/</g, '&lt;') + .replace(/>/g, '&gt;') + .replace(/"/g, '&quot;') + .replace(/'/g, '&#39;'); + } + + /** + * 错误包装 + */ + private wrapError(error: unknown, _taskId: string, filename: string): Error { + const err = error as Error; + const message = err.message.toLowerCase(); + + if (message.includes('enoent') || message.includes('no such file')) { + return new Error( + `Office file not found: ${filename}. The file may have been moved or deleted.` + ); + } + + if (message.includes('corrupt') || message.includes('invalid')) { + return new Error( + `Office file appears to be corrupted: ${filename}. Please check the file.` + ); + } + + if (message.includes('password') || message.includes('encrypted')) { + return new Error( + `Cannot process password-protected file: ${filename}. 
Please provide an unencrypted version.` + ); + } + + return new Error(`Failed to process Office file ${filename}: ${err.message}`); + } + + /** + * 清理任务临时文件 + */ + async cleanup(taskId: string): Promise { + const taskDir = ImagePathUtil.getTaskDir(taskId); + + try { + await fs.rm(taskDir, { recursive: true, force: true }); + } catch (error) { + console.warn(`[OfficeSplitter] Failed to cleanup task ${taskId}:`, error); + } + } + + /** + * 延时函数 + */ + private sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); + } +} +``` + +--- + +## 依赖变更 + +### 新增依赖 + +```json +{ + "dependencies": { + "mammoth": "^1.6.0", + "xlsx": "^0.18.5" + } +} +``` + +| 依赖 | 用途 | 大小 | +|------|------|------| +| mammoth | Word 文档转 HTML | ~1.5MB | +| xlsx (SheetJS) | Excel 文件解析 | ~2MB | +| jszip | 解析 PPTX(已存在于项目中) | - | + +### 包大小影响 + +- **新增依赖大小**:约 3.5MB +- **无额外运行时依赖**:复用 Electron 内置能力 + +--- + +## 文件结构 + +``` +src/ +├── shared/ +│ └── types/ +│ └── DocType.ts # 新增:文档类型定义 +│ +└── core/ + └── infrastructure/ + └── adapters/ + └── split/ + ├── SplitterFactory.ts # 修改:支持 docType 分支 + ├── OfficeSplitter.ts # 新增:Office 文件分割器 + ├── PDFSplitter.ts # 保持不变 + └── ImageSplitter.ts # 保持不变 +``` + +--- + +## 对比现有实现 + +| 特性 | PDFSplitter | ImageSplitter | OfficeSplitter | +|------|-------------|---------------|----------------| +| 输入格式 | PDF | JPG/PNG/WebP | DOCX/PPTX/XLSX | +| 转换方式 | pdf-to-png | fs.copyFile | HTML → capturePage | +| 分页策略 | 原生页面 | 单页 | Word 按高度 / PPT 每幻灯片 / Excel 按 Sheet | +| 页码支持 | 支持 page_range | 忽略 | 暂不支持 | +| 重试机制 | 3 次重试 | 无 | 无(可扩展) | + +--- + +## Excel 分页策略详解 + +### 挑战 + +| 问题 | 说明 | +|------|------| +| **列数不固定** | 表格可能有 3 列或 100 列,宽度难以预设 | +| **行数不固定** | 可能几行或数万行 | +| **多 Sheet** | 一个 Excel 可能有多个工作表 | +| **合并单元格** | 复杂布局影响渲染 | + +### 采用方案:按 Sheet 分页 + 智能尺寸计算 + +``` +┌──────────────┐ ┌─────────────────┐ ┌────────────────────┐ +│ .xlsx 文件 │───▶│ xlsx 库解析 │───▶│ 获取 Sheet 列表 │ +└──────────────┘ └─────────────────┘ └─────────┬──────────┘ + │ + 
┌────────────────────────────────┘ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ 遍历每个 Sheet │ +├─────────────────────────────────────────────────────────────────┤ +│ 1. 获取数据范围(行数、列数) │ +│ 2. 计算渲染尺寸 │ +│ - 宽度 = min(列数 × 100px, 1600px) │ +│ - 高度 = min(行数 × 28px, 2000px) │ +│ 3. 转换为 HTML 表格(使用 sheet_to_html) │ +│ 4. 设置 BrowserWindow 尺寸 │ +│ 5. 分页截图(如果内容超高) │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ 输出: [Sheet1-Page1.png, Sheet1-Page2.png, Sheet2-Page1.png] │ +└─────────────────────────────────────────────────────────────────┘ +``` + +### 尺寸计算公式 + +```typescript +// 宽度计算 +width = Math.max( + Math.min(colCount * 100, 1600), // 上限 1600px + 800 // 下限 800px +) + +// 高度计算(单次截图) +height = Math.min(rowCount * 28, 2000) // 上限 2000px + +// 总页数 +pages = Math.ceil(totalContentHeight / 2000) +``` + +--- + +## 使用示例 + +```typescript +// 使用文档类型创建分割器 +const factory = new SplitterFactory(uploadsDir); + +// 方式 1:直接使用 DocType +const wordSplitter = factory.create(DocType.WORD); +const pptSplitter = factory.create(DocType.POWERPOINT); +const excelSplitter = factory.create(DocType.EXCEL); + +// 方式 2:从文件名自动推断 +const splitter = factory.createFromFilename('report.xlsx'); +const result = await splitter.split(task); + +// 处理结果 +console.log(`Generated ${result.totalPages} pages`); +result.pages.forEach(page => { + console.log(`Page ${page.page}: ${page.imagePath}`); +}); +``` + +--- + +## 后续扩展 + +### 可选优化 + +1. **旧格式支持**:.doc、.ppt、.xls(非 XML 格式)需要额外库支持 +2. **图片提取**:从文档中提取嵌入图片 +3. **样式还原**:更精确的 CSS 样式映射 +4. **页码范围**:支持 page_range 参数 +5. **Excel 图表**:提取并渲染 Excel 图表 + +### 性能优化 + +1. **窗口复用**:多任务时复用 BrowserWindow +2. **并行渲染**:多页/多 Sheet 同时渲染 +3. **缓存机制**:相同文档的转换缓存 +4. 
**流式处理**:大文件分块读取 + +--- + +## 版本兼容性 + +- Node.js: ≥ 18.0.0 +- Electron: ≥ 28.0.0 +- mammoth: ≥ 1.6.0 +- xlsx: ≥ 0.18.5 From 45c723f5ffcabc5f72ec15494ff49101b92892b3 Mon Sep 17 00:00:00 2001 From: Jorben Date: Mon, 26 Jan 2026 14:50:15 +0800 Subject: [PATCH 2/3] =?UTF-8?q?docs:=20=F0=9F=93=9D=20refactor=20Office=20?= =?UTF-8?q?splitter=20to=20separate=20format=20handlers=20with=20security?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the Office splitter design document with major architectural changes: - Split single OfficeSplitter into WordSplitter, PPTSplitter, ExcelSplitter - Remove legacy OLE format support (.doc, .ppt, .xls) - only OOXML supported - Add PathValidator for security against path traversal attacks - Add PageRangeParser for page/sheet range selection - Add RenderWindowPoolFactory for shared rendering resources - Add ChunkedRenderer for memory-optimized large document rendering - Update architecture diagrams to reflect new modular design - Add comprehensive test specifications for all components This design provides better maintainability through separation of concerns and improved security with explicit path validation. 
Co-Authored-By: Claude --- docs/OFFICE_SPLITTER_DESIGN.md | 2626 ++++++++++++++++++++++---------- 1 file changed, 1842 insertions(+), 784 deletions(-) diff --git a/docs/OFFICE_SPLITTER_DESIGN.md b/docs/OFFICE_SPLITTER_DESIGN.md index 58fb1b6..f95b443 100644 --- a/docs/OFFICE_SPLITTER_DESIGN.md +++ b/docs/OFFICE_SPLITTER_DESIGN.md @@ -5,16 +5,30 @@ 本文档描述如何扩展 MarkPDFdown Desktop 以支持 Word、PowerPoint 和 Excel 文件的转换。 ### 目标 -- 支持 `word` 类型(.doc, .docx, .dot, .dotx) -- 支持 `powerpoint` 类型(.ppt, .pptx, .pot, .potx) -- 支持 `excel` 类型(.xls, .xlsx, .xlt, .xltx, .csv) -- 轻量级实现(包大小增加 < 4MB) + +- 支持 `word` 类型(.docx, .dotx) +- 支持 `powerpoint` 类型(.pptx, .potx) +- 支持 `excel` 类型(.xlsx, .xltx, .csv) +- 支持页面范围选择(与 PDF 功能对齐) +- 各格式使用专门的解析库,确保最佳兼容性 - 复用 Electron 渲染能力,无需额外浏览器进程 ### 设计原则 + - 统一使用 `docType`(文档类型)进行分支判断,而非文件扩展名 - 遵循现有清洁架构(ISplitter 接口 → SplitterFactory) -- 与现有 PDF/Image 分割器保持一致的 API +- 每种 Office 格式独立 Splitter 类,使用最适合的解析库 +- 安全优先:验证文件路径,防止路径遍历攻击 + +### 不支持的格式 + +以下旧格式(OLE 复合文档)**不支持**,因为依赖库无法处理: + +- `.doc`, `.dot` (旧版 Word) +- `.ppt`, `.pot` (旧版 PowerPoint) +- `.xls`, `.xlt` (旧版 Excel) + +如需处理这些格式,建议用户先用 Microsoft Office 或 LibreOffice 转换为新格式。 --- @@ -32,36 +46,40 @@ export enum DocType { EXCEL = 'excel', } -// 扩展名到文档类型的映射 +// 扩展名到文档类型的映射(仅支持 Office Open XML 格式) export const EXTENSION_TO_DOCTYPE: Record = { // PDF 'pdf': DocType.PDF, - + // Image 'jpg': DocType.IMAGE, 'jpeg': DocType.IMAGE, 'png': DocType.IMAGE, 'webp': DocType.IMAGE, - - // Word - 'doc': DocType.WORD, + + // Word (仅 Office Open XML 格式) 'docx': DocType.WORD, - 'dot': DocType.WORD, 'dotx': DocType.WORD, - - // PowerPoint - 'ppt': DocType.POWERPOINT, + + // PowerPoint (仅 Office Open XML 格式) 'pptx': DocType.POWERPOINT, - 'pot': DocType.POWERPOINT, 'potx': DocType.POWERPOINT, - - // Excel - 'xls': DocType.EXCEL, + + // Excel (仅 Office Open XML 格式 + CSV) 'xlsx': DocType.EXCEL, - 'xlt': DocType.EXCEL, 'xltx': DocType.EXCEL, 'csv': DocType.EXCEL, }; + +// 不支持的旧格式(用于友好提示) +export const LEGACY_FORMATS = ['doc', 'dot', 
'ppt', 'pot', 'xls', 'xlt']; + +/** + * 检查是否为不支持的旧格式 + */ +export function isLegacyFormat(ext: string): boolean { + return LEGACY_FORMATS.includes(ext.toLowerCase().replace('.', '')); +} ``` ### 2. 类图 @@ -86,14 +104,29 @@ export const EXTENSION_TO_DOCTYPE: Record = { │ ┌────────┴─────────┬──────────────────┬──────────────────┐ │ │ │ │ │ │ │ │ ▼ ▼ ▼ ▼ │ -│ ┌──────────────┐ ┌──────────────┐ ┌──────────────────────────────────┐ │ -│ │ PDFSplitter │ │ImageSplitter │ │ OfficeSplitter (NEW) │ │ -│ ├──────────────┤ ├──────────────┤ ├──────────────────────────────────┤ │ -│ │ pdf-to-png │ │ fs.copyFile │ │ - mammoth.js (Word → HTML) │ │ -│ │ pdf-lib │ │ │ │ - jszip (PPT → HTML) │ │ -│ └──────────────┘ └──────────────┘ │ - xlsx (Excel → HTML) │ │ -│ │ - BrowserWindow (HTML → PNG) │ │ -│ └──────────────────────────────────┘ │ +│ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ │ +│ │ PDFSplitter │ │ImageSplitter │ │ WordSplitter │ │ PPTSplitter │ │ +│ ├──────────────┤ ├──────────────┤ ├──────────────┤ ├──────────────┤ │ +│ │ pdf-to-png │ │ fs.copyFile │ │ mammoth │ │ JSZip │ │ +│ │ pdf-lib │ │ │ │ (DOCX→HTML) │ │ (PPTX解压) │ │ +│ └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ │ +│ │ +│ ┌──────────────┐ │ +│ │ExcelSplitter │ │ +│ ├──────────────┤ │ +│ │ exceljs │ │ +│ │ papaparse │ │ +│ └──────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────────────┐ │ +│ │ 共享辅助模块 │ │ +│ ├─────────────────────────────────────────────────────────────────┤ │ +│ │ - RenderWindowPoolFactory: 窗口池工厂(非单例) │ │ +│ │ - TempFileManager: 临时 HTML 文件管理 │ │ +│ │ - PathValidator: 路径安全验证(防止路径遍历) │ │ +│ │ - ChunkedRenderer: 分段截图渲染器(内存优化) │ │ +│ │ - PageRangeParser: 页面范围解析 │ │ +│ └─────────────────────────────────────────────────────────────────┘ │ │ │ │ ┌─────────────────────────────────────────────────────────────────┐ │ │ │ SplitterFactory (修改) │ │ @@ -109,29 +142,40 @@ export const EXTENSION_TO_DOCTYPE: Record = { ``` ┌──────────────────┐ 
┌────────────────────┐ ┌─────────────────────┐ -│ 上传文件 │───▶│ SplitterFactory │───▶│ OfficeSplitter │ +│ 上传文件 │───▶│ SplitterFactory │───▶│ 具体 Splitter │ │ .docx/.pptx/.xlsx│ │ getDocType() │ │ │ └──────────────────┘ │ create(docType) │ └──────────┬──────────┘ └────────────────────┘ │ ▼ - ┌────────────────────────────────────────────────────┐ - │ split(task) │ - ├────────────────────────────────────────────────────┤ - │ 1. 根据 docType 选择解析策略 │ - │ - word: mammoth.js 转 HTML │ - │ - powerpoint: jszip 解析幻灯片 XML │ - │ - excel: xlsx 库解析工作表 │ - │ │ - │ 2. 渲染 HTML 为图片 │ - │ - 创建隐藏 BrowserWindow │ - │ - loadURL (data:text/html) │ - │ - capturePage() 截图 │ - │ │ - │ 3. 分页策略 │ - │ - word: 按内容高度分页(A4 比例) │ - │ - powerpoint: 每张幻灯片一页 │ - │ - excel: 按 Sheet 分页 + 智能尺寸计算 │ - └────────────────────────────────────────────────────┘ +┌────────────────────────────────────────────────────────────────────────┐ +│ WordSplitter │ +├────────────────────────────────────────────────────────────────────────┤ +│ 1. mammoth 解析 DOCX → HTML(保留样式、表格、图片) │ +│ 2. 渲染 HTML → 分段截图(每 4000px) │ +│ 3. 按 A4 高度切分为页面 │ +│ 4. pageRange 基于渲染后页码过滤 │ +└────────────────────────────────────────────────────────────────────────┘ + +┌────────────────────────────────────────────────────────────────────────┐ +│ PowerPointSplitter │ +├────────────────────────────────────────────────────────────────────────┤ +│ 1. JSZip 解压 PPTX 文件 │ +│ 2. 解析 ppt/slides/slide*.xml 获取幻灯片列表 │ +│ 3. 每张幻灯片独立构建 HTML │ +│ 4. 渲染为固定尺寸 PNG(16:9) │ +│ 5. pageRange 基于幻灯片编号过滤 │ +└────────────────────────────────────────────────────────────────────────┘ + +┌────────────────────────────────────────────────────────────────────────┐ +│ ExcelSplitter │ +├────────────────────────────────────────────────────────────────────────┤ +│ 1. exceljs 解析 XLSX / papaparse 解析 CSV │ +│ 2. 获取 Sheet 列表和数据 │ +│ 3. 每个 Sheet 构建 HTML 表格 │ +│ 4. 分段截图 + 垂直分页 │ +│ 5. 
pageRange 基于 Sheet 索引/名称过滤 │ +└────────────────────────────────────────────────────────────────────────┘ + │ ▼ ┌────────────────────────────────────────────────────┐ @@ -144,783 +188,1714 @@ export const EXTENSION_TO_DOCTYPE: Record = { ## 详细设计 -### 1. SplitterFactory 改造 +### 1. 路径安全验证器 ```typescript -// src/core/infrastructure/adapters/split/SplitterFactory.ts +// src/core/infrastructure/adapters/split/PathValidator.ts import path from 'path'; -import { ISplitter } from '../../../domain/split/ISplitter.js'; -import { PDFSplitter } from './PDFSplitter.js'; -import { ImageSplitter } from './ImageSplitter.js'; -import { OfficeSplitter } from './OfficeSplitter.js'; -import { DocType, EXTENSION_TO_DOCTYPE } from '../../../../shared/types/DocType.js'; -export class SplitterFactory { - private readonly uploadsDir: string; +/** + * 路径安全验证器 + * + * 防止路径遍历攻击(如 ../../../etc/passwd) + */ +export class PathValidator { + /** + * 验证文件路径是否在允许的目录内 + * + * @param filePath - 待验证的文件路径 + * @param allowedDir - 允许的根目录 + * @throws 如果路径不在允许目录内 + */ + static validate(filePath: string, allowedDir: string): void { + const resolvedPath = path.resolve(filePath); + const resolvedAllowedDir = path.resolve(allowedDir); - constructor(uploadsDir: string) { - this.uploadsDir = uploadsDir; + // 修复:支持路径等于允许目录本身的情况 + const isWithinDir = + resolvedPath === resolvedAllowedDir || + resolvedPath.startsWith(resolvedAllowedDir + path.sep); + + if (!isWithinDir) { + throw new Error( + `Security error: Path "${filePath}" is outside allowed directory. ` + + `Possible path traversal attack detected.` + ); + } } /** - * 根据文档类型创建对应的分割器 - * - * @param docType - 文档类型(统一使用 DocType 枚举) - * @returns 对应的分割器实例 + * 安全地拼接路径并验证 + * + * @param baseDir - 基础目录 + * @param segments - 路径片段 + * @returns 验证后的完整路径 */ - create(docType: DocType): ISplitter { - switch (docType) { - case DocType.PDF: - return new PDFSplitter(this.uploadsDir); + static safePath(baseDir: string, ...segments: string[]): string { + // 循环移除 .. 直到没有变化(防止 .... 
等绕过) + const sanitizedSegments = segments.map(seg => { + let prev = ''; + let current = seg; + while (prev !== current) { + prev = current; + current = current + .replace(/\.\./g, '') + .replace(/^[/\\]+/, '') + .replace(/[/\\]+$/, ''); + } + return current; + }); - case DocType.IMAGE: - return new ImageSplitter(this.uploadsDir); + const fullPath = path.join(baseDir, ...sanitizedSegments); + this.validate(fullPath, baseDir); - case DocType.WORD: - case DocType.POWERPOINT: - case DocType.EXCEL: - return new OfficeSplitter(this.uploadsDir, docType); + return fullPath; + } +} +``` - default: - const supportedTypes = Object.values(DocType).join(', '); - throw new Error( - `Unsupported document type: ${docType}. Supported types: ${supportedTypes}` - ); +### 2. 页面范围解析器 + +```typescript +// src/core/infrastructure/adapters/split/PageRangeParser.ts + +export interface ParsedRange { + /** 包含的页码/索引列表(1-based) */ + indices: number[]; + /** 原始范围字符串 */ + raw: string; +} + +export interface SheetRange { + /** 解析类型 */ + type: 'indices' | 'names'; + /** 按索引指定(1-based) */ + indices?: number[]; + /** 按名称指定 */ + names?: string[]; + /** 原始范围字符串 */ + raw: string; +} + +/** + * 页面范围解析器 + * + * 支持格式: + * - 单页: "3" + * - 范围: "1-5" + * - 混合: "1,3,5-10" + * - Sheet 索引: "#1-2" 或 "#1,3"(# 前缀明确表示索引) + * - Sheet 名称: "Sheet1,数据表"(无 # 前缀) + */ +export class PageRangeParser { + /** + * 解析数字页码范围 + * + * @param range - 范围字符串,如 "1-3,5,7-9" + * @param maxPage - 最大页码(用于验证和开区间) + */ + static parseNumeric(range: string | undefined, maxPage: number): ParsedRange { + if (!range || range.trim() === '') { + // 未指定范围,返回全部页 + return { + indices: Array.from({ length: maxPage }, (_, i) => i + 1), + raw: '', + }; + } + + const indices = new Set(); + const parts = range.split(',').map(p => p.trim()).filter(Boolean); + + for (const part of parts) { + if (part.includes('-')) { + // 范围格式: "1-5" + const [startStr, endStr] = part.split('-').map(s => s.trim()); + const start = parseInt(startStr, 10); + const end = 
endStr === '' ? maxPage : parseInt(endStr, 10); + + if (isNaN(start) || isNaN(end)) { + throw new Error(`Invalid range format: "${part}"`); + } + if (start < 1 || end > maxPage || start > end) { + throw new Error( + `Range "${part}" is out of bounds (valid: 1-${maxPage})` + ); + } + + for (let i = start; i <= end; i++) { + indices.add(i); + } + } else { + // 单页格式: "3" + const page = parseInt(part, 10); + if (isNaN(page)) { + throw new Error(`Invalid page number: "${part}"`); + } + if (page < 1 || page > maxPage) { + throw new Error( + `Page ${page} is out of bounds (valid: 1-${maxPage})` + ); + } + indices.add(page); + } } + + return { + indices: Array.from(indices).sort((a, b) => a - b), + raw: range, + }; } /** - * 从文件名获取文档类型 - * - * @param filename - 文件名 - * @returns 文档类型 + * 解析 Excel Sheet 范围 + * + * 支持格式: + * - 索引: "#1-2" 或 "#1,3"(# 前缀明确表示索引) + * - 名称: "Sheet1,数据表"(无 # 前缀,即使 Sheet 名是数字) + * + * @param range - 范围字符串 + * @param sheetNames - 可用的 Sheet 名称列表 */ - static getDocType(filename: string): DocType { - const ext = path.extname(filename); - if (!ext || ext === '.') { - throw new Error(`Filename has no extension: ${filename}`); + static parseSheetRange( + range: string | undefined, + sheetNames: string[] + ): SheetRange { + if (!range || range.trim() === '') { + // 未指定,返回全部 Sheet + return { + type: 'indices', + indices: Array.from({ length: sheetNames.length }, (_, i) => i + 1), + raw: '', + }; } - const normalizedExt = ext.slice(1).toLowerCase(); - const docType = EXTENSION_TO_DOCTYPE[normalizedExt]; + const trimmed = range.trim(); - if (!docType) { - const supportedExts = Object.keys(EXTENSION_TO_DOCTYPE).join(', '); - throw new Error( - `Unsupported file extension: ${ext}. 
Supported extensions: ${supportedExts}` - ); - } + // 使用 # 前缀明确区分索引和名称 + if (trimmed.startsWith('#')) { + // 按索引解析(去掉 # 前缀) + const indexRange = trimmed.slice(1); + const parsed = this.parseNumeric(indexRange, sheetNames.length); + return { + type: 'indices', + indices: parsed.indices, + raw: range, + }; + } else { + // 按名称解析 + const names = trimmed.split(',').map(n => n.trim()).filter(Boolean); + const invalidNames = names.filter(n => !sheetNames.includes(n)); - return docType; + if (invalidNames.length > 0) { + throw new Error( + `Sheet not found: "${invalidNames.join('", "')}". ` + + `Available sheets: "${sheetNames.join('", "')}"` + ); + } + + return { + type: 'names', + names, + raw: range, + }; + } } /** - * 从文件名创建分割器(便捷方法) + * 根据 SheetRange 过滤 Sheet 列表 */ - createFromFilename(filename: string): ISplitter { - const docType = SplitterFactory.getDocType(filename); - return this.create(docType); + static filterSheets( + sheetNames: string[], + range: SheetRange + ): string[] { + if (range.type === 'names' && range.names) { + // 按名称保持用户指定顺序 + return range.names; + } + if (range.type === 'indices' && range.indices) { + // 按索引过滤(1-based) + return range.indices.map(i => sheetNames[i - 1]); + } + return sheetNames; } } ``` -### 2. OfficeSplitter 实现 +### 3. 
渲染窗口池工厂 ```typescript -// src/core/infrastructure/adapters/split/OfficeSplitter.ts -import { promises as fs } from 'fs'; -import path from 'path'; -import mammoth from 'mammoth'; +// src/core/infrastructure/adapters/split/RenderWindowPoolFactory.ts import { BrowserWindow } from 'electron'; -import { ISplitter, SplitResult, PageInfo } from '../../../domain/split/ISplitter.js'; -import { Task } from '../../../../shared/types/index.js'; -import { DocType } from '../../../../shared/types/DocType.js'; -import { ImagePathUtil } from './ImagePathUtil.js'; -import { WORKER_CONFIG } from '../../config/worker.config.js'; -/** - * 页面配置常量 - */ -const PAGE_CONFIG = { - /** A4 页面宽度(像素,96 DPI) */ - PAGE_WIDTH: 794, - /** A4 页面高度(像素,96 DPI) */ - PAGE_HEIGHT: 1123, - /** PPT 幻灯片宽度 */ - SLIDE_WIDTH: 1280, - /** PPT 幻灯片高度(16:9) */ - SLIDE_HEIGHT: 720, - /** 渲染缩放因子 */ - DEVICE_SCALE_FACTOR: 2, -}; +interface PooledWindow { + window: BrowserWindow; + busy: boolean; +} + +interface WaitingRequest { + resolve: (window: BrowserWindow) => void; + reject: (error: Error) => void; + width: number; + height: number; + timer: NodeJS.Timeout; +} /** - * Excel 页面配置常量 + * 窗口池配置 */ -const EXCEL_CONFIG = { - /** 最大渲染宽度 */ - MAX_WIDTH: 1600, - /** 最大渲染高度(单次截图) */ - MAX_HEIGHT: 2000, - /** 每列默认宽度 */ - DEFAULT_COL_WIDTH: 100, - /** 最小列宽 */ - MIN_COL_WIDTH: 60, - /** 行高 */ - ROW_HEIGHT: 28, - /** 最小页面宽度 */ - MIN_WIDTH: 800, +export interface RenderWindowPoolConfig { + /** 最大窗口数量 */ + maxSize: number; + /** 等待超时时间(毫秒) */ + acquireTimeout: number; +} + +const DEFAULT_CONFIG: RenderWindowPoolConfig = { + maxSize: 3, + acquireTimeout: 60000, // 60 秒 }; /** - * Office 文件分割器 - * - * 支持: - * - Word 文档:.doc, .docx, .dot, .dotx - * - PowerPoint 演示文稿:.ppt, .pptx, .pot, .potx - * - Excel 电子表格:.xls, .xlsx, .xlt, .xltx, .csv - * - * 技术方案: - * - 使用 mammoth.js 将 Word 文档转换为 HTML - * - 使用 jszip 解析 PowerPoint 文件 - * - 使用 xlsx (SheetJS) 解析 Excel 文件 - * - 复用 Electron BrowserWindow 进行 HTML → PNG 渲染 + * BrowserWindow 渲染窗口池 + 
* + * 避免频繁创建/销毁窗口,控制并发资源消耗 + * 包含超时机制防止无限等待 + * + * 注意:使用工厂模式创建,每个 Splitter 可拥有独立的窗口池 */ -export class OfficeSplitter implements ISplitter { - private readonly uploadsDir: string; - private readonly docType: DocType; - - constructor(uploadsDir: string, docType: DocType) { - this.uploadsDir = uploadsDir; - this.docType = docType; +export class RenderWindowPool { + private pool: PooledWindow[] = []; + private readonly config: RenderWindowPoolConfig; + private waitQueue: WaitingRequest[] = []; + private destroyed = false; + + constructor(config: RenderWindowPoolConfig) { + this.config = config; } /** - * 分割 Office 文件为页面图片 + * 获取一个可用的渲染窗口 + * + * @throws 如果等待超时或池已销毁 */ - async split(task: Task): Promise { - if (!task.id) { - throw new Error('Task ID is required'); - } - if (!task.filename) { - throw new Error('Task filename is required'); + async acquire(width: number, height: number): Promise { + if (this.destroyed) { + throw new Error('RenderWindowPool has been destroyed'); } - const taskId = task.id; - const filename = task.filename; - const sourcePath = path.join(this.uploadsDir, taskId, filename); + // 查找空闲窗口 + const available = this.pool.find(p => !p.busy); + if (available) { + available.busy = true; + available.window.setSize(width, height); + return available.window; + } - try { - // 确保源文件存在 - await fs.access(sourcePath); + // 池未满则创建新窗口 + if (this.pool.length < this.config.maxSize) { + const window = this.createWindow(width, height); + this.pool.push({ window, busy: true }); + return window; + } - // 确保输出目录存在 - const taskDir = ImagePathUtil.getTaskDir(taskId); - await fs.mkdir(taskDir, { recursive: true }); + // 池已满,等待释放(带超时) + return new Promise((resolve, reject) => { + const timer = setTimeout(() => { + // 移除等待请求 + const index = this.waitQueue.findIndex(r => r.timer === timer); + if (index !== -1) { + this.waitQueue.splice(index, 1); + } + reject(new Error( + `RenderWindowPool acquire timeout after ${this.config.acquireTimeout}ms. 
` + + `All ${this.config.maxSize} windows are busy.` + )); + }, this.config.acquireTimeout); + + this.waitQueue.push({ + resolve, + reject, + width, + height, + timer, + }); + }); + } - // 根据文档类型选择处理策略 - let pages: PageInfo[]; - - switch (this.docType) { - case DocType.WORD: - pages = await this.splitWord(sourcePath, taskId); - break; - case DocType.POWERPOINT: - pages = await this.splitPowerPoint(sourcePath, taskId); - break; - case DocType.EXCEL: - pages = await this.splitExcel(sourcePath, taskId); - break; - default: - throw new Error(`OfficeSplitter does not support docType: ${this.docType}`); + /** + * 释放窗口回池 + */ + async release(window: BrowserWindow): Promise { + const pooled = this.pool.find(p => p.window === window); + if (!pooled) return; + + // 检查窗口是否仍然有效 + if (window.isDestroyed()) { + // 窗口已销毁,从池中移除 + const index = this.pool.indexOf(pooled); + if (index !== -1) { + this.pool.splice(index, 1); } + return; + } - return { - pages, - totalPages: pages.length, - }; - } catch (error) { - throw this.wrapError(error, taskId, filename); + // 清理窗口状态(等待完成) + try { + await window.loadURL('about:blank'); + } catch { + // 忽略清理失败 + } + + // 如果有等待者,直接分配 + if (this.waitQueue.length > 0) { + const waiter = this.waitQueue.shift()!; + clearTimeout(waiter.timer); + window.setSize(waiter.width, waiter.height); + waiter.resolve(window); + } else { + pooled.busy = false; } } /** - * 分割 Word 文档 + * 销毁所有窗口 */ - private async splitWord(sourcePath: string, taskId: string): Promise { - // 使用 mammoth.js 将 docx 转换为 HTML - const result = await mammoth.convertToHtml({ path: sourcePath }); - const html = result.value; - - // 如果有警告,记录日志 - if (result.messages.length > 0) { - console.warn(`[OfficeSplitter] Word conversion warnings:`, result.messages); + destroy(): void { + this.destroyed = true; + + // 拒绝所有等待中的请求 + for (const waiter of this.waitQueue) { + clearTimeout(waiter.timer); + waiter.reject(new Error('RenderWindowPool is being destroyed')); + } + this.waitQueue = []; + + // 销毁所有窗口 
+ for (const pooled of this.pool) { + if (!pooled.window.isDestroyed()) { + pooled.window.destroy(); + } } + this.pool = []; + } - // 构建完整 HTML 页面 - const fullHtml = this.buildWordHtml(html); + /** + * 获取当前池状态(用于调试) + */ + getStatus(): { total: number; busy: number; waiting: number } { + return { + total: this.pool.length, + busy: this.pool.filter(p => p.busy).length, + waiting: this.waitQueue.length, + }; + } - // 渲染为图片(按页分割) - return this.renderHtmlToPages(fullHtml, taskId, DocType.WORD); + private createWindow(width: number, height: number): BrowserWindow { + return new BrowserWindow({ + show: false, + width, + height, + webPreferences: { + nodeIntegration: false, + contextIsolation: true, + offscreen: true, + }, + }); } +} +/** + * 窗口池工厂 + * + * 每次调用创建独立的窗口池实例 + */ +export class RenderWindowPoolFactory { /** - * 分割 PowerPoint 演示文稿 + * 创建新的窗口池实例 */ - private async splitPowerPoint(sourcePath: string, taskId: string): Promise { - // 解析 PPTX 文件 - const slides = await this.parsePptx(sourcePath); + static create(config: Partial = {}): RenderWindowPool { + return new RenderWindowPool({ + ...DEFAULT_CONFIG, + ...config, + }); + } +} +``` - const pages: PageInfo[] = []; +### 4. 
分段截图渲染器 - // 每张幻灯片单独渲染 - for (let i = 0; i < slides.length; i++) { - const slideHtml = this.buildSlideHtml(slides[i], i + 1); - const pageImages = await this.renderHtmlToPages(slideHtml, taskId, DocType.POWERPOINT, i + 1); - pages.push(...pageImages); - } +```typescript +// src/core/infrastructure/adapters/split/ChunkedRenderer.ts +import { BrowserWindow } from 'electron'; +import { promises as fs } from 'fs'; + +/** + * 分段截图配置 + */ +export interface ChunkedRenderConfig { + /** 每段截图高度(像素) */ + chunkHeight: number; + /** 设备缩放因子 */ + deviceScaleFactor: number; + /** 页面宽度 */ + pageWidth: number; + /** 分页高度(用于切分输出) */ + pageHeight: number; +} + +const DEFAULT_CHUNK_CONFIG: ChunkedRenderConfig = { + chunkHeight: 4000, + deviceScaleFactor: 2, + pageWidth: 794, + pageHeight: 1123, +}; + +/** + * 分段截图渲染器 + * + * 解决大文档单次截图内存过大的问题 + * 每次只截取 chunkHeight 高度的区域 + */ +export class ChunkedRenderer { + private readonly config: ChunkedRenderConfig; - return pages; + constructor(config: Partial = {}) { + this.config = { ...DEFAULT_CHUNK_CONFIG, ...config }; } /** - * 分割 Excel 电子表格 - * - * 策略:按 Sheet 分页 + 智能尺寸计算 - * - 每个 Sheet 独立渲染 - * - 根据列数动态计算宽度 - * - 超长内容自动垂直分页 + * 分段截图并切分为页面 + * + * @param window - 渲染窗口 + * @param totalHeight - 文档总高度(CSS 像素) + * @param outputPathFn - 输出路径生成函数 (pageNum) => path + * @returns 生成的页面数量 */ - private async splitExcel(sourcePath: string, taskId: string): Promise { - const XLSX = await import('xlsx'); - - // 读取 Excel 文件 - const workbook = XLSX.read(await fs.readFile(sourcePath), { type: 'buffer' }); - - if (workbook.SheetNames.length === 0) { - throw new Error('Excel file contains no sheets'); - } - - const pages: PageInfo[] = []; - let pageIndex = 0; - - // 遍历每个 Sheet - for (const sheetName of workbook.SheetNames) { - const worksheet = workbook.Sheets[sheetName]; - - // 获取数据范围 - const range = XLSX.utils.decode_range(worksheet['!ref'] || 'A1'); - const colCount = range.e.c - range.s.c + 1; - const rowCount = range.e.r - range.s.r + 1; - - // 计算渲染尺寸 - 
const dimensions = this.calculateExcelDimensions(colCount, rowCount); - - // 转换为 HTML 表格 - const htmlTable = XLSX.utils.sheet_to_html(worksheet, { - editable: false, - header: '', - footer: '', + async renderToPages( + window: BrowserWindow, + totalHeight: number, + outputPathFn: (pageNum: number) => string + ): Promise { + const { chunkHeight, deviceScaleFactor, pageWidth, pageHeight } = this.config; + const sharp = (await import('sharp')).default; + + const scaledChunkHeight = chunkHeight * deviceScaleFactor; + const scaledPageHeight = pageHeight * deviceScaleFactor; + const scaledPageWidth = pageWidth * deviceScaleFactor; + + let pageNum = 1; + let processedHeight = 0; + let carryOverBuffer: Buffer | null = null; + let carryOverHeight = 0; + + while (processedHeight < totalHeight) { + // 计算本次截图区域 + const captureHeight = Math.min(chunkHeight, totalHeight - processedHeight); + + // 滚动到目标位置 + await window.webContents.executeJavaScript( + `window.scrollTo(0, ${processedHeight})` + ); + await this.sleep(50); + + // 截取当前区域 + const image = await window.webContents.capturePage({ + x: 0, + y: 0, + width: pageWidth, + height: captureHeight, }); + const chunkBuffer = image.toPNG(); + + // 合并上一段的剩余部分 + let workingBuffer: Buffer; + let workingHeight: number; + + if (carryOverBuffer) { + // 垂直拼接 carryOver + 当前 chunk + workingBuffer = await sharp(carryOverBuffer) + .extend({ + bottom: captureHeight * deviceScaleFactor, + background: { r: 255, g: 255, b: 255, alpha: 1 }, + }) + .composite([{ + input: chunkBuffer, + top: carryOverHeight, + left: 0, + }]) + .toBuffer(); + workingHeight = carryOverHeight + captureHeight * deviceScaleFactor; + carryOverBuffer = null; + carryOverHeight = 0; + } else { + workingBuffer = chunkBuffer; + workingHeight = captureHeight * deviceScaleFactor; + } + + // 从 workingBuffer 切分出完整页面 + let extractedHeight = 0; + while (extractedHeight + scaledPageHeight <= workingHeight) { + const outputPath = outputPathFn(pageNum); + await sharp(workingBuffer) + 
.extract({ + left: 0, + top: extractedHeight, + width: scaledPageWidth, + height: scaledPageHeight, + }) + .toFile(outputPath); + + pageNum++; + extractedHeight += scaledPageHeight; + } - // 构建完整 HTML - const fullHtml = this.buildExcelHtml(htmlTable, sheetName, dimensions.width); + // 保存剩余部分用于下一轮 + if (extractedHeight < workingHeight) { + const remainingHeight = workingHeight - extractedHeight; + carryOverBuffer = await sharp(workingBuffer) + .extract({ + left: 0, + top: extractedHeight, + width: scaledPageWidth, + height: remainingHeight, + }) + .toBuffer(); + carryOverHeight = remainingHeight; + } - // 渲染为图片(可能需要垂直分页) - const sheetPages = await this.renderExcelToPages( - fullHtml, - taskId, - pageIndex, - dimensions - ); + processedHeight += captureHeight; + } - pages.push(...sheetPages); - pageIndex += sheetPages.length; + // 处理最后的剩余部分(不足一页) + if (carryOverBuffer && carryOverHeight > 0) { + const outputPath = outputPathFn(pageNum); + await sharp(carryOverBuffer).toFile(outputPath); + pageNum++; } - return pages; + return pageNum - 1; } + private sleep(ms: number): Promise<void> { + return new Promise(resolve => setTimeout(resolve, ms)); + } +} +``` + +### 5. 
临时文件管理器 + +```typescript +// src/core/infrastructure/adapters/split/TempFileManager.ts +import { promises as fs } from 'fs'; +import path from 'path'; +import os from 'os'; +import { randomUUID } from 'crypto'; + +/** + * 临时文件管理器 + * + * 用于创建和清理 HTML 渲染临时文件 + * 解决 data URL 长度限制问题 + */ +export class TempFileManager { + private static readonly TEMP_PREFIX = 'markpdfdown-render-'; + private tempFiles: Set = new Set(); + /** - * 计算 Excel Sheet 的渲染尺寸 + * 创建临时 HTML 文件 */ - private calculateExcelDimensions( - colCount: number, - rowCount: number - ): { width: number; height: number; pages: number } { - // 计算内容宽度 - const contentWidth = Math.min( - colCount * EXCEL_CONFIG.DEFAULT_COL_WIDTH, - EXCEL_CONFIG.MAX_WIDTH - ); - - // 计算内容高度 - const contentHeight = rowCount * EXCEL_CONFIG.ROW_HEIGHT; - - // 计算需要多少页 - const pages = Math.ceil(contentHeight / EXCEL_CONFIG.MAX_HEIGHT); - - return { - width: Math.max(contentWidth, EXCEL_CONFIG.MIN_WIDTH), - height: Math.min(contentHeight, EXCEL_CONFIG.MAX_HEIGHT), - pages, - }; + async createHtmlFile(html: string): Promise { + const tempDir = os.tmpdir(); + const filename = `${TempFileManager.TEMP_PREFIX}${randomUUID()}.html`; + const filepath = path.join(tempDir, filename); + + await fs.writeFile(filepath, html, 'utf-8'); + this.tempFiles.add(filepath); + + return filepath; } /** - * 构建 Excel 表格的完整 HTML + * 删除单个临时文件 */ - private buildExcelHtml(tableHtml: string, sheetName: string, width: number): string { - return ` - - - - - + +${content} +`; + } + + /** + * 加载 HTML 文件并等待渲染完成 + */ + private loadAndWait(window: Electron.BrowserWindow, htmlPath: string): Promise { + return new Promise((resolve, reject) => { + let timeoutId: NodeJS.Timeout; + + const cleanup = () => { + clearTimeout(timeoutId); + }; + + timeoutId = setTimeout(() => { + cleanup(); + reject(new Error('Page load timeout')); + }, 30000); + + window.webContents.once('did-finish-load', () => { + cleanup(); + setTimeout(resolve, 200); + }); + + 
window.webContents.once('did-fail-load', (_event, errorCode, errorDesc) => { + cleanup(); + reject(new Error(`Failed to load page: ${errorDesc} (${errorCode})`)); + }); + + window.loadFile(htmlPath).catch((err) => { + cleanup(); + reject(err); + }); + }); + } + + /** + * 错误包装 + */ + private wrapError(error: unknown, filename: string): Error { + const err = error as Error; + const message = err.message.toLowerCase(); + + if (message.includes('security error') || message.includes('path traversal')) { + return err; } - th, td { - border: 1px solid #ddd; - padding: 8px 12px; - text-align: left; - white-space: nowrap; - overflow: hidden; - text-overflow: ellipsis; - max-width: 300px; + if (message.includes('enoent') || message.includes('no such file')) { + return new Error(`Word file not found: ${filename}`); } - th { - background-color: #f5f5f5; - font-weight: 600; - color: #333; + if (message.includes('corrupt') || message.includes('invalid')) { + return new Error(`Word file appears to be corrupted: ${filename}`); } - tr:nth-child(even) { - background-color: #fafafa; + + return new Error(`Failed to process Word file ${filename}: ${err.message}`); + } + + /** + * 清理任务临时文件 + */ + async cleanup(taskId: string): Promise { + const taskDir = ImagePathUtil.getTaskDir(taskId); + await fs.rm(taskDir, { recursive: true, force: true }).catch(() => {}); + this.windowPool.destroy(); + } +} +``` + +### 8. 
PowerPointSplitter 实现 + +```typescript +// src/core/infrastructure/adapters/split/PowerPointSplitter.ts +import { promises as fs } from 'fs'; +import JSZip from 'jszip'; +import { XMLParser } from 'fast-xml-parser'; +import { ISplitter, SplitResult, PageInfo } from '../../../domain/split/ISplitter.js'; +import { Task } from '../../../../shared/types/index.js'; +import { ImagePathUtil } from './ImagePathUtil.js'; +import { RenderWindowPoolFactory, RenderWindowPool } from './RenderWindowPoolFactory.js'; +import { TempFileManager } from './TempFileManager.js'; +import { PageRangeParser } from './PageRangeParser.js'; +import { PathValidator } from './PathValidator.js'; + +/** + * 幻灯片配置常量 + */ +const SLIDE_CONFIG = { + /** 幻灯片宽度 */ + WIDTH: 1280, + /** 幻灯片高度(16:9) */ + HEIGHT: 720, + /** 渲染缩放因子 */ + DEVICE_SCALE_FACTOR: 2, +} as const; + +/** + * 解析后的幻灯片数据 + */ +interface SlideData { + index: number; + title?: string; + content: string[]; + notes?: string; + background?: string; +} + +/** + * PowerPoint 文件分割器 + * + * 支持:.pptx, .potx + * + * 技术方案: + * - 使用 JSZip 解压 PPTX 文件 + * - 解析 ppt/slides/slide*.xml 获取幻灯片内容 + * - 每张幻灯片独立渲染为 PNG + */ +export class PowerPointSplitter implements ISplitter { + private readonly uploadsDir: string; + private readonly windowPool: RenderWindowPool; + private readonly tempFileManager: TempFileManager; + private readonly xmlParser: XMLParser; + + constructor(uploadsDir: string) { + this.uploadsDir = uploadsDir; + this.windowPool = RenderWindowPoolFactory.create({ maxSize: 2, acquireTimeout: 60000 }); + this.tempFileManager = new TempFileManager(); + this.xmlParser = new XMLParser({ + ignoreAttributes: false, + attributeNamePrefix: '@_', + }); + } + + /** + * 分割 PowerPoint 文件为页面图片 + */ + async split(task: Task): Promise { + if (!task.id || !task.filename) { + throw new Error('Task ID and filename are required'); + } + + const taskId = task.id; + const filename = task.filename; + + const sourcePath = PathValidator.safePath(this.uploadsDir, 
taskId, filename); + + try { + const fileBuffer = await fs.readFile(sourcePath); + const zip = await JSZip.loadAsync(fileBuffer); + + const taskDir = ImagePathUtil.getTaskDir(taskId); + await fs.mkdir(taskDir, { recursive: true }); + + // 解析幻灯片 + const slides = await this.parseSlides(zip); + + if (slides.length === 0) { + throw new Error('PowerPoint file contains no slides'); + } + + // 解析页面范围 + const parsed = PageRangeParser.parseNumeric(task.pageRange, slides.length); + const selectedIndices = new Set(parsed.indices); + + // 渲染选中的幻灯片 + const pages: PageInfo[] = []; + let outputPageNum = 1; + + for (const slide of slides) { + if (!selectedIndices.has(slide.index)) { + continue; + } + + const slideHtml = this.buildSlideHtml(slide); + const imagePath = ImagePathUtil.getPath(taskId, outputPageNum); + + await this.renderSlide(slideHtml, imagePath); + + pages.push({ + page: outputPageNum, + pageSource: slide.index, + imagePath, + }); + + outputPageNum++; + } + + return { pages, totalPages: pages.length }; + } catch (error) { + throw this.wrapError(error, filename); + } finally { + await this.tempFileManager.cleanup(); } - tr:hover { - background-color: #f0f7ff; + } + + /** + * 从 PPTX 解析幻灯片数据 + */ + private async parseSlides(zip: JSZip): Promise { + const slides: SlideData[] = []; + + // 获取所有幻灯片文件 + const slideFiles = Object.keys(zip.files) + .filter(name => /^ppt\/slides\/slide\d+\.xml$/.test(name)) + .sort((a, b) => { + const numA = parseInt(a.match(/slide(\d+)/)?.[1] || '0'); + const numB = parseInt(b.match(/slide(\d+)/)?.[1] || '0'); + return numA - numB; + }); + + for (let i = 0; i < slideFiles.length; i++) { + const slideFile = slideFiles[i]; + const content = await zip.file(slideFile)?.async('text'); + + if (!content) continue; + + const parsed = this.xmlParser.parse(content); + const slideData = this.extractSlideContent(parsed, i + 1); + slides.push(slideData); } - /* 首行样式(通常是标题行) */ - tr:first-child td, - tr:first-child th { - background-color: #e8f5e9; - 
font-weight: 600; + + return slides; + } + + /** + * 从 XML 提取幻灯片内容 + */ + private extractSlideContent(parsed: any, index: number): SlideData { + const texts: string[] = []; + let title: string | undefined; + + // 递归提取所有文本 + const extractTexts = (obj: any): void => { + if (!obj || typeof obj !== 'object') return; + + // 提取 标签内的文本 + if (obj['a:t']) { + const text = typeof obj['a:t'] === 'string' ? obj['a:t'] : String(obj['a:t']); + if (text.trim()) { + texts.push(text.trim()); + } + } + + // 递归处理数组和对象 + for (const key of Object.keys(obj)) { + if (Array.isArray(obj[key])) { + obj[key].forEach(extractTexts); + } else if (typeof obj[key] === 'object') { + extractTexts(obj[key]); + } + } + }; + + extractTexts(parsed); + + // 第一个文本通常是标题 + if (texts.length > 0) { + title = texts[0]; + } + + return { + index, + title, + content: texts.slice(1), + }; + } + + /** + * 构建幻灯片 HTML + */ + private buildSlideHtml(slide: SlideData): string { + const titleHtml = slide.title + ? `

${this.escapeHtml(slide.title)}

` + : ''; + + const contentHtml = slide.content + .map(text => `

${this.escapeHtml(text)}

`) + .join('\n'); + + return ` + + + + + -
📊 ${this.escapeHtml(sheetName)}
- ${tableHtml} + ${titleHtml} +
${contentHtml}
+
${slide.index}
`; } /** - * 渲染 Excel HTML 为页面图片 - * - * 处理超长表格的垂直分页 + * 渲染幻灯片为图片 */ - private async renderExcelToPages( - html: string, - taskId: string, - startPageIndex: number, - dimensions: { width: number; height: number; pages: number } - ): Promise { - const pages: PageInfo[] = []; + private async renderSlide(html: string, outputPath: string): Promise { + const tempHtmlPath = await this.tempFileManager.createHtmlFile(html); + const window = await this.windowPool.acquire(SLIDE_CONFIG.WIDTH, SLIDE_CONFIG.HEIGHT); - // 创建隐藏的渲染窗口 - const renderWindow = new BrowserWindow({ - show: false, - width: dimensions.width, - height: dimensions.height, - webPreferences: { - nodeIntegration: false, - contextIsolation: true, - offscreen: true, - }, + try { + window.webContents.setZoomFactor(SLIDE_CONFIG.DEVICE_SCALE_FACTOR); + await this.loadAndWait(window, tempHtmlPath); + + const image = await window.webContents.capturePage({ + x: 0, + y: 0, + width: SLIDE_CONFIG.WIDTH, + height: SLIDE_CONFIG.HEIGHT, + }); + + await fs.writeFile(outputPath, image.toPNG()); + } finally { + await this.windowPool.release(window); + await this.tempFileManager.deleteFile(tempHtmlPath); + } + } + + /** + * 加载 HTML 文件并等待渲染完成 + */ + private loadAndWait(window: Electron.BrowserWindow, htmlPath: string): Promise { + return new Promise((resolve, reject) => { + let timeoutId: NodeJS.Timeout; + + const cleanup = () => clearTimeout(timeoutId); + + timeoutId = setTimeout(() => { + cleanup(); + reject(new Error('Page load timeout')); + }, 30000); + + window.webContents.once('did-finish-load', () => { + cleanup(); + setTimeout(resolve, 200); + }); + + window.webContents.once('did-fail-load', (_event, errorCode, errorDesc) => { + cleanup(); + reject(new Error(`Failed to load page: ${errorDesc} (${errorCode})`)); + }); + + window.loadFile(htmlPath).catch((err) => { + cleanup(); + reject(err); + }); }); + } - try { - renderWindow.webContents.setZoomFactor(PAGE_CONFIG.DEVICE_SCALE_FACTOR); + private escapeHtml(text: string): 
string { + return text + .replace(/&/g, '&amp;') + .replace(/</g, '&lt;') + .replace(/>/g, '&gt;') + .replace(/"/g, '&quot;') + .replace(/'/g, '&#39;'); + } - const dataUrl = `data:text/html;charset=utf-8,${encodeURIComponent(html)}`; - await renderWindow.loadURL(dataUrl); - await this.waitForRender(renderWindow); + private wrapError(error: unknown, filename: string): Error { + const err = error as Error; + const message = err.message.toLowerCase(); - // 获取实际内容高度 - const totalHeight = await renderWindow.webContents.executeJavaScript( - 'document.body.scrollHeight' - ); + if (message.includes('security error')) return err; + if (message.includes('enoent')) { + return new Error(`PowerPoint file not found: ${filename}`); + } + if (message.includes('invalid') || message.includes('corrupt')) { + return new Error(`PowerPoint file appears to be corrupted: ${filename}`); + } + + return new Error(`Failed to process PowerPoint file ${filename}: ${err.message}`); + } + + async cleanup(taskId: string): Promise<void> { + const taskDir = ImagePathUtil.getTaskDir(taskId); + await fs.rm(taskDir, { recursive: true, force: true }).catch(() => {}); + this.windowPool.destroy(); + } +} +``` + +### 9. 
ExcelSplitter 实现 + +```typescript +// src/core/infrastructure/adapters/split/ExcelSplitter.ts +import { promises as fs } from 'fs'; +import path from 'path'; +import ExcelJS from 'exceljs'; +import Papa from 'papaparse'; +import { ISplitter, SplitResult, PageInfo } from '../../../domain/split/ISplitter.js'; +import { Task } from '../../../../shared/types/index.js'; +import { ImagePathUtil } from './ImagePathUtil.js'; +import { RenderWindowPoolFactory, RenderWindowPool } from './RenderWindowPoolFactory.js'; +import { TempFileManager } from './TempFileManager.js'; +import { ChunkedRenderer } from './ChunkedRenderer.js'; +import { PageRangeParser } from './PageRangeParser.js'; +import { PathValidator } from './PathValidator.js'; +import { EncodingDetector } from './EncodingDetector.js'; + +/** + * Excel 页面配置常量 + */ +const EXCEL_CONFIG = { + /** 最大渲染宽度 */ + MAX_WIDTH: 1600, + /** 分页高度 */ + PAGE_HEIGHT: 1200, + /** 分段截图高度 */ + CHUNK_HEIGHT: 4000, + /** 每列默认宽度 */ + DEFAULT_COL_WIDTH: 100, + /** 最小页面宽度 */ + MIN_WIDTH: 800, + /** 渲染缩放因子 */ + DEVICE_SCALE_FACTOR: 2, +} as const; + +/** + * Sheet 数据 + */ +interface SheetData { + name: string; + rows: string[][]; + colCount: number; +} + +/** + * Excel 文件分割器 + * + * 支持:.xlsx, .xltx, .csv + * + * 技术方案: + * - 使用 exceljs 解析 XLSX 文件 + * - 使用 papaparse 解析 CSV 文件(RFC 4180 兼容) + * - 按 Sheet 分页 + 超长内容垂直分页 + */ +export class ExcelSplitter implements ISplitter { + private readonly uploadsDir: string; + private readonly windowPool: RenderWindowPool; + private readonly tempFileManager: TempFileManager; + + constructor(uploadsDir: string) { + this.uploadsDir = uploadsDir; + this.windowPool = RenderWindowPoolFactory.create({ maxSize: 2, acquireTimeout: 60000 }); + this.tempFileManager = new TempFileManager(); + } + + /** + * 分割 Excel 文件为页面图片 + */ + async split(task: Task): Promise { + if (!task.id || !task.filename) { + throw new Error('Task ID and filename are required'); + } + + const taskId = task.id; + const filename = task.filename; + 
+ const sourcePath = PathValidator.safePath(this.uploadsDir, taskId, filename); + + try { + await fs.access(sourcePath); + + const taskDir = ImagePathUtil.getTaskDir(taskId); + await fs.mkdir(taskDir, { recursive: true }); - const pageHeight = EXCEL_CONFIG.MAX_HEIGHT; - const totalPages = Math.ceil(totalHeight / pageHeight); + const ext = path.extname(sourcePath).toLowerCase(); + const sheets = ext === '.csv' + ? await this.parseCsv(sourcePath) + : await this.parseExcel(sourcePath); - for (let i = 0; i < totalPages; i++) { - const pageNum = startPageIndex + i + 1; - const imagePath = ImagePathUtil.getPath(taskId, pageNum); + if (sheets.length === 0) { + throw new Error('Excel file contains no data'); + } - // 滚动到对应位置 - await renderWindow.webContents.executeJavaScript( - `window.scrollTo(0, ${i * pageHeight})` - ); + // 解析 Sheet 范围 + const sheetNames = sheets.map(s => s.name); + const sheetRange = PageRangeParser.parseSheetRange(task.pageRange, sheetNames); + const selectedSheets = PageRangeParser.filterSheets(sheetNames, sheetRange); - await this.sleep(100); + const pages: PageInfo[] = []; + let pageIndex = 0; - // 截图 - const captureHeight = Math.min(pageHeight, totalHeight - i * pageHeight); - const image = await renderWindow.webContents.capturePage({ - x: 0, - y: 0, - width: dimensions.width, - height: captureHeight, - }); + for (const sheetName of selectedSheets) { + const sheet = sheets.find(s => s.name === sheetName)!; + const sheetPages = await this.renderSheet(sheet, taskId, pageIndex); - await fs.writeFile(imagePath, image.toPNG()); + for (const page of sheetPages) { + page.sheetName = sheetName; + } - pages.push({ - page: pageNum, - pageSource: pageNum, - imagePath, - }); + pages.push(...sheetPages); + pageIndex += sheetPages.length; } - return pages; + return { pages, totalPages: pages.length }; + } catch (error) { + throw this.wrapError(error, filename); } finally { - renderWindow.destroy(); + await this.tempFileManager.cleanup(); } } /** - * 解析 PPTX 文件 
- * - * PPTX 是 ZIP 格式,包含 XML 文件 + * 解析 Excel 文件 */ - private async parsePptx(sourcePath: string): Promise { - const JSZip = (await import('jszip')).default; - - const data = await fs.readFile(sourcePath); - const zip = await JSZip.loadAsync(data); - - const slides: string[] = []; - - // PPTX 结构: ppt/slides/slide1.xml, slide2.xml, ... - const slideFiles = Object.keys(zip.files) - .filter(name => /^ppt\/slides\/slide\d+\.xml$/.test(name)) - .sort((a, b) => { - const numA = parseInt(a.match(/slide(\d+)/)?.[1] || '0'); - const numB = parseInt(b.match(/slide(\d+)/)?.[1] || '0'); - return numA - numB; + private async parseExcel(filePath: string): Promise { + const workbook = new ExcelJS.Workbook(); + await workbook.xlsx.readFile(filePath); + + const sheets: SheetData[] = []; + + workbook.eachSheet((worksheet) => { + const rows: string[][] = []; + let maxCol = 0; + + worksheet.eachRow((row) => { + const rowData: string[] = []; + row.eachCell({ includeEmpty: true }, (cell, colNumber) => { + rowData[colNumber - 1] = cell.text || ''; + maxCol = Math.max(maxCol, colNumber); + }); + rows.push(rowData); }); - for (const slideFile of slideFiles) { - const content = await zip.file(slideFile)?.async('string'); - if (content) { - // 从 XML 提取文本内容 - const slideHtml = this.pptxXmlToHtml(content); - slides.push(slideHtml); + // 填充空单元格 + for (const row of rows) { + while (row.length < maxCol) { + row.push(''); + } } - } - if (slides.length === 0) { - throw new Error('PowerPoint file contains no slides'); - } + if (rows.length > 0) { + sheets.push({ + name: worksheet.name, + rows, + colCount: maxCol, + }); + } + }); - return slides; + return sheets; } /** - * 将 PPTX XML 转换为 HTML + * 解析 CSV 文件(使用 papaparse) */ - private pptxXmlToHtml(xml: string): string { - // 提取文本内容( 标签) - const textRegex = /]*>([^<]*)<\/a:t>/g; - const texts: string[] = []; - let match; - - while ((match = textRegex.exec(xml)) !== null) { - if (match[1].trim()) { - texts.push(match[1]); - } + private async 
parseCsv(filePath: string): Promise { + const buffer = await fs.readFile(filePath); + const content = EncodingDetector.toUtf8String(buffer); + + return new Promise((resolve, reject) => { + Papa.parse(content, { + complete: (results) => { + const rows = results.data as string[][]; + + // 移除尾部空行 + while (rows.length > 0 && rows[rows.length - 1].every(cell => !cell)) { + rows.pop(); + } + + if (rows.length === 0) { + resolve([]); + return; + } + + const colCount = Math.max(...rows.map(r => r.length)); + + // 标准化列数 + for (const row of rows) { + while (row.length < colCount) { + row.push(''); + } + } + + resolve([{ + name: 'CSV Data', + rows, + colCount, + }]); + }, + error: (error) => { + reject(new Error(`Failed to parse CSV: ${error.message}`)); + }, + }); + }); + } + + /** + * 渲染单个 Sheet 为页面图片 + */ + private async renderSheet( + sheet: SheetData, + taskId: string, + startPageIndex: number + ): Promise { + const tableHtml = this.buildTableHtml(sheet.rows); + const width = Math.min( + Math.max(sheet.colCount * EXCEL_CONFIG.DEFAULT_COL_WIDTH, EXCEL_CONFIG.MIN_WIDTH), + EXCEL_CONFIG.MAX_WIDTH + ); + const fullHtml = this.buildExcelHtml(tableHtml, sheet.name, width); + + const tempHtmlPath = await this.tempFileManager.createHtmlFile(fullHtml); + const window = await this.windowPool.acquire(width, EXCEL_CONFIG.CHUNK_HEIGHT); + + const chunkedRenderer = new ChunkedRenderer({ + chunkHeight: EXCEL_CONFIG.CHUNK_HEIGHT, + deviceScaleFactor: EXCEL_CONFIG.DEVICE_SCALE_FACTOR, + pageWidth: width, + pageHeight: EXCEL_CONFIG.PAGE_HEIGHT, + }); + + try { + window.webContents.setZoomFactor(EXCEL_CONFIG.DEVICE_SCALE_FACTOR); + await this.loadAndWait(window, tempHtmlPath); + + const totalHeight = await window.webContents.executeJavaScript( + 'document.body.scrollHeight' + ); + + const pages: PageInfo[] = []; + let pageNum = startPageIndex + 1; + + const totalPages = await chunkedRenderer.renderToPages( + window, + totalHeight, + (num) => { + const imagePath = 
ImagePathUtil.getPath(taskId, startPageIndex + num); + pages.push({ + page: startPageIndex + num, + pageSource: startPageIndex + num, + imagePath, + }); + return imagePath; + } + ); + + return pages; + } finally { + await this.windowPool.release(window); + await this.tempFileManager.deleteFile(tempHtmlPath); } + } - // 构建简单 HTML(保持段落结构) - return texts.map(text => `

${this.escapeHtml(text)}

`).join('\n'); + /** + * 构建 HTML 表格 + */ + private buildTableHtml(rows: string[][]): string { + if (!rows.length) return '
Empty
'; + + const rowsHtml = rows.map((row, idx) => { + const cellTag = idx === 0 ? 'th' : 'td'; + const cellsHtml = row.map(cell => + `<${cellTag}>${this.escapeHtml(cell)}` + ).join(''); + return `${cellsHtml}`; + }).join('\n'); + + return `${rowsHtml}
`; } /** - * 构建 Word 文档的完整 HTML + * 构建 Excel 表格的完整 HTML */ - private buildWordHtml(content: string): string { + private buildExcelHtml(tableHtml: string, sheetName: string, width: number): string { return ` - - - ${content} - -`; - } - - /** - * 构建单张幻灯片的 HTML - */ - private buildSlideHtml(content: string, slideNumber: number): string { - return ` - - - - - -
- ${content} -
-
${slideNumber}
+
${this.escapeHtml(sheetName)}
+ ${tableHtml} `; } - /** - * 将 HTML 渲染为页面图片 - * - * 利用 Electron 的 BrowserWindow 进行渲染: - * 1. 创建隐藏的 BrowserWindow - * 2. 加载 HTML 内容 - * 3. 使用 capturePage() 截图 - * 4. 保存为 PNG 文件 - */ - private async renderHtmlToPages( - html: string, - taskId: string, - docType: DocType, - slideNumber?: number - ): Promise { - const pages: PageInfo[] = []; - - // 根据文档类型确定页面尺寸 - const isSlide = docType === DocType.POWERPOINT; - const pageWidth = isSlide ? PAGE_CONFIG.SLIDE_WIDTH : PAGE_CONFIG.PAGE_WIDTH; - const pageHeight = isSlide ? PAGE_CONFIG.SLIDE_HEIGHT : PAGE_CONFIG.PAGE_HEIGHT; - - // 创建隐藏的渲染窗口 - const renderWindow = new BrowserWindow({ - show: false, - width: pageWidth, - height: pageHeight, - webPreferences: { - nodeIntegration: false, - contextIsolation: true, - offscreen: true, - }, - }); - - try { - // 设置设备缩放因子以获得高清图片 - renderWindow.webContents.setZoomFactor(PAGE_CONFIG.DEVICE_SCALE_FACTOR); - - // 加载 HTML 内容 - const dataUrl = `data:text/html;charset=utf-8,${encodeURIComponent(html)}`; - await renderWindow.loadURL(dataUrl); - - // 等待页面渲染完成 - await this.waitForRender(renderWindow); - - if (isSlide) { - // PPT:每张幻灯片单独截图 - const pageNum = slideNumber || 1; - const imagePath = ImagePathUtil.getPath(taskId, pageNum); - - const image = await renderWindow.webContents.capturePage(); - await fs.writeFile(imagePath, image.toPNG()); - - pages.push({ - page: pageNum, - pageSource: pageNum, - imagePath, - }); - } else { - // Word:获取文档总高度,按页分割 - const totalHeight = await renderWindow.webContents.executeJavaScript( - 'document.body.scrollHeight' - ); - - const totalPages = Math.ceil(totalHeight / pageHeight); - - for (let i = 0; i < totalPages; i++) { - const pageNum = i + 1; - const imagePath = ImagePathUtil.getPath(taskId, pageNum); - - // 滚动到对应页面位置 - await renderWindow.webContents.executeJavaScript( - `window.scrollTo(0, ${i * pageHeight})` - ); - - // 等待滚动完成 - await this.sleep(100); - - // 截图 - const image = await renderWindow.webContents.capturePage({ - x: 0, - y: 0, - width: 
pageWidth, - height: pageHeight, - }); + private loadAndWait(window: Electron.BrowserWindow, htmlPath: string): Promise { + return new Promise((resolve, reject) => { + let timeoutId: NodeJS.Timeout; + const cleanup = () => clearTimeout(timeoutId); - await fs.writeFile(imagePath, image.toPNG()); + timeoutId = setTimeout(() => { + cleanup(); + reject(new Error('Page load timeout')); + }, 30000); - pages.push({ - page: pageNum, - pageSource: pageNum, - imagePath, - }); - } - } + window.webContents.once('did-finish-load', () => { + cleanup(); + setTimeout(resolve, 200); + }); - return pages; - } finally { - // 确保窗口被关闭 - renderWindow.destroy(); - } - } + window.webContents.once('did-fail-load', (_event, errorCode, errorDesc) => { + cleanup(); + reject(new Error(`Failed to load page: ${errorDesc} (${errorCode})`)); + }); - /** - * 等待页面渲染完成 - */ - private async waitForRender(window: BrowserWindow): Promise { - return new Promise((resolve) => { - window.webContents.on('did-finish-load', () => { - // 额外等待一小段时间确保样式渲染完成 - setTimeout(resolve, 200); + window.loadFile(htmlPath).catch((err) => { + cleanup(); + reject(err); }); }); } - /** - * HTML 转义 - */ private escapeHtml(text: string): string { return text .replace(/&/g, '&') @@ -930,53 +1905,140 @@ export class OfficeSplitter implements ISplitter { .replace(/'/g, '''); } - /** - * 错误包装 - */ - private wrapError(error: unknown, _taskId: string, filename: string): Error { + private wrapError(error: unknown, filename: string): Error { const err = error as Error; const message = err.message.toLowerCase(); - if (message.includes('enoent') || message.includes('no such file')) { - return new Error( - `Office file not found: ${filename}. 
The file may have been moved or deleted.` - ); + if (message.includes('security error')) return err; + if (message.includes('enoent')) { + return new Error(`Excel file not found: ${filename}`); + } + if (message.includes('invalid') || message.includes('corrupt')) { + return new Error(`Excel file appears to be corrupted: ${filename}`); } - if (message.includes('corrupt') || message.includes('invalid')) { - return new Error( - `Office file appears to be corrupted: ${filename}. Please check the file.` - ); + return new Error(`Failed to process Excel file ${filename}: ${err.message}`); + } + + async cleanup(taskId: string): Promise { + const taskDir = ImagePathUtil.getTaskDir(taskId); + await fs.rm(taskDir, { recursive: true, force: true }).catch(() => {}); + this.windowPool.destroy(); + } +} +``` + +### 10. 编码检测工具 + +```typescript +// src/core/infrastructure/adapters/split/EncodingDetector.ts +import * as chardet from 'chardet'; +import * as iconv from 'iconv-lite'; + +/** + * 支持的编码类型 + */ +export type SupportedEncoding = 'utf-8' | 'gbk' | 'gb2312' | 'gb18030'; + +/** + * 文件编码检测与转换 + * + * 用于处理 CSV 文件的编码问题 + */ +export class EncodingDetector { + private static readonly SUPPORTED_ENCODINGS: SupportedEncoding[] = [ + 'utf-8', 'gbk', 'gb2312', 'gb18030' + ]; + + /** + * 检测 Buffer 的编码 + */ + static detect(buffer: Buffer): SupportedEncoding { + const detected = chardet.detect(buffer); + + if (!detected) { + return 'utf-8'; } - if (message.includes('password') || message.includes('encrypted')) { - return new Error( - `Cannot process password-protected file: ${filename}. 
Please provide an unencrypted version.` - ); + const normalized = detected.toLowerCase().replace('-', ''); + + if (normalized.includes('utf8') || normalized.includes('ascii')) { + return 'utf-8'; + } + if (normalized.includes('gb18030')) { + return 'gb18030'; + } + if (normalized.includes('gbk') || normalized.includes('gb2312')) { + return 'gbk'; } - return new Error(`Failed to process Office file ${filename}: ${err.message}`); + return 'utf-8'; } /** - * 清理任务临时文件 + * 将 Buffer 转换为 UTF-8 字符串 */ - async cleanup(taskId: string): Promise { - const taskDir = ImagePathUtil.getTaskDir(taskId); + static toUtf8String(buffer: Buffer): string { + const encoding = this.detect(buffer); - try { - await fs.rm(taskDir, { recursive: true, force: true }); - } catch (error) { - console.warn(`[OfficeSplitter] Failed to cleanup task ${taskId}:`, error); + if (encoding === 'utf-8') { + return buffer.toString('utf-8'); } + + return iconv.decode(buffer, encoding); } +} +``` + +### 11. PageInfo 类型扩展 + +```typescript +// src/core/domain/split/ISplitter.ts + +export interface PageInfo { + /** 输出页码(连续编号,从 1 开始) */ + page: number; + + /** 原始页码/幻灯片编号/Sheet 索引(用于溯源) */ + pageSource: number; + + /** 图片文件路径 */ + imagePath: string; + + /** Sheet 名称(仅 Excel,可选) */ + sheetName?: string; +} + +export interface SplitResult { + pages: PageInfo[]; + totalPages: number; +} + +export interface ISplitter { + split(task: Task): Promise; + cleanup(taskId: string): Promise; +} +``` + +### 12. Task 类型扩展 + +```typescript +// src/shared/types/index.ts + +export interface Task { + id: string; + filename: string; + // ... 
其他字段 /** - * 延时函数 + * 页面范围(可选) + * + * - PDF: "1-3,5" 表示原生页码 + * - Word: "1-3,5" 表示渲染后页码(注意:非原文档逻辑页码) + * - PPT: "1,3,5-7" 表示幻灯片编号 + * - Excel: "#1-2" 表示 Sheet 索引,"Sheet1,数据表" 表示 Sheet 名称 */ - private sleep(ms: number): Promise { - return new Promise((resolve) => setTimeout(resolve, ms)); - } + pageRange?: string; } ``` @@ -989,22 +2051,35 @@ export class OfficeSplitter implements ISplitter { ```json { "dependencies": { - "mammoth": "^1.6.0", - "xlsx": "^0.18.5" + "mammoth": "^1.8.0", + "jszip": "^3.10.1", + "fast-xml-parser": "^4.5.0", + "exceljs": "^4.4.0", + "papaparse": "^5.4.1", + "chardet": "^2.1.0", + "iconv-lite": "^0.6.3", + "sharp": "^0.33.0" + }, + "devDependencies": { + "@types/papaparse": "^5.3.15" } } ``` -| 依赖 | 用途 | 大小 | -|------|------|------| -| mammoth | Word 文档转 HTML | ~1.5MB | -| xlsx (SheetJS) | Excel 文件解析 | ~2MB | -| jszip | 解析 PPTX(已存在于项目中) | - | +### 依赖体积说明 -### 包大小影响 +| 依赖 | 用途 | 安装后大小 | 备注 | +|------|------|-----------|------| +| mammoth | Word 解析 | ~2 MB | 纯 JS,无原生依赖 | +| jszip | PPTX 解压 | ~200 KB | 纯 JS | +| fast-xml-parser | XML 解析 | ~300 KB | 纯 JS,高性能 | +| exceljs | Excel 解析 | ~5 MB | 支持样式、公式 | +| papaparse | CSV 解析 | ~50 KB | RFC 4180 兼容 | +| chardet | 编码检测 | ~162 KB | 纯 TypeScript | +| iconv-lite | 编码转换 | ~350 KB | 纯 JavaScript | +| sharp | 图片切分 | 7-12 MB | 包含 libvips | -- **新增依赖大小**:约 3.5MB -- **无额外运行时依赖**:复用 Electron 内置能力 +**总计:约 15-20 MB**(相比原方案减少约 35 MB,因移除了 pdfjs-dist) --- @@ -1014,128 +2089,105 @@ export class OfficeSplitter implements ISplitter { src/ ├── shared/ │ └── types/ +│ ├── index.ts # 修改:Task 增加 pageRange 字段 │ └── DocType.ts # 新增:文档类型定义 │ └── core/ + ├── domain/ + │ └── split/ + │ └── ISplitter.ts # 修改:PageInfo 增加 sheetName 字段 + │ └── infrastructure/ └── adapters/ └── split/ - ├── SplitterFactory.ts # 修改:支持 docType 分支 - ├── OfficeSplitter.ts # 新增:Office 文件分割器 - ├── PDFSplitter.ts # 保持不变 - └── ImageSplitter.ts # 保持不变 + ├── SplitterFactory.ts # 修改:支持 docType 分支 + ├── WordSplitter.ts # 新增:Word 分割器 (mammoth) + ├── 
PowerPointSplitter.ts # 新增:PPT 分割器 (JSZip) + ├── ExcelSplitter.ts # 新增:Excel 分割器 (exceljs) + ├── RenderWindowPoolFactory.ts # 新增:窗口池工厂 + ├── ChunkedRenderer.ts # 新增:分段截图渲染器 + ├── TempFileManager.ts # 新增:临时文件管理 + ├── PathValidator.ts # 新增:路径安全验证 + ├── EncodingDetector.ts # 新增:编码检测 + ├── PageRangeParser.ts # 新增:页面范围解析 + ├── PDFSplitter.ts # 保持不变 + └── ImageSplitter.ts # 保持不变 ``` --- -## 对比现有实现 - -| 特性 | PDFSplitter | ImageSplitter | OfficeSplitter | -|------|-------------|---------------|----------------| -| 输入格式 | PDF | JPG/PNG/WebP | DOCX/PPTX/XLSX | -| 转换方式 | pdf-to-png | fs.copyFile | HTML → capturePage | -| 分页策略 | 原生页面 | 单页 | Word 按高度 / PPT 每幻灯片 / Excel 按 Sheet | -| 页码支持 | 支持 page_range | 忽略 | 暂不支持 | -| 重试机制 | 3 次重试 | 无 | 无(可扩展) | - ---- - -## Excel 分页策略详解 - -### 挑战 +## 页面范围支持 -| 问题 | 说明 | -|------|------| -| **列数不固定** | 表格可能有 3 列或 100 列,宽度难以预设 | -| **行数不固定** | 可能几行或数万行 | -| **多 Sheet** | 一个 Excel 可能有多个工作表 | -| **合并单元格** | 复杂布局影响渲染 | +### 语义定义 -### 采用方案:按 Sheet 分页 + 智能尺寸计算 +| 文档类型 | 范围语义 | 示例 | +|----------|---------|------| +| **PDF** | 原生页码 | `1-3,5` = 第 1-3 页和第 5 页 | +| **Word** | **渲染后页码**(非原文档逻辑页码) | `1-3` = 渲染后的前 3 页截图 | +| **PowerPoint** | 幻灯片编号 | `1,3,5-7` = 第 1、3、5-7 张幻灯片 | +| **Excel** | Sheet 索引(#前缀)或名称 | `#1-2` 或 `Sheet1,数据表` | -``` -┌──────────────┐ ┌─────────────────┐ ┌────────────────────┐ -│ .xlsx 文件 │───▶│ xlsx 库解析 │───▶│ 获取 Sheet 列表 │ -└──────────────┘ └─────────────────┘ └─────────┬──────────┘ - │ - ┌────────────────────────────────┘ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ 遍历每个 Sheet │ -├─────────────────────────────────────────────────────────────────┤ -│ 1. 获取数据范围(行数、列数) │ -│ 2. 计算渲染尺寸 │ -│ - 宽度 = min(列数 × 100px, 1600px) │ -│ - 高度 = min(行数 × 28px, 2000px) │ -│ 3. 转换为 HTML 表格(使用 sheet_to_html) │ -│ 4. 设置 BrowserWindow 尺寸 │ -│ 5. 
分页截图(如果内容超高) │ -└─────────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ 输出: [Sheet1-Page1.png, Sheet1-Page2.png, Sheet2-Page1.png] │ -└─────────────────────────────────────────────────────────────────┘ -``` - -### 尺寸计算公式 +### 使用示例 ```typescript -// 宽度计算 -width = Math.max( - Math.min(colCount * 100, 1600), // 上限 1600px - 800 // 下限 800px -) - -// 高度计算(单次截图) -height = Math.min(rowCount * 28, 2000) // 上限 2000px - -// 总页数 -pages = Math.ceil(totalContentHeight / 2000) -``` +// Word: 提取渲染后的第 1-3 页 +// 注意:这是渲染后的页码,可能与原文档的逻辑页码不同 +const wordTask: Task = { + id: 'task-001', + filename: 'report.docx', + pageRange: '1-3', +}; ---- +// PPT: 提取第 1、3、5-7 张幻灯片 +const pptTask: Task = { + id: 'task-002', + filename: 'presentation.pptx', + pageRange: '1,3,5-7', +}; -## 使用示例 +// Excel: 按索引提取 Sheet(使用 # 前缀) +const excelTask1: Task = { + id: 'task-003', + filename: 'data.xlsx', + pageRange: '#1-2', // 第 1、2 个 Sheet +}; -```typescript -// 使用文档类型创建分割器 -const factory = new SplitterFactory(uploadsDir); - -// 方式 1:直接使用 DocType -const wordSplitter = factory.create(DocType.WORD); -const pptSplitter = factory.create(DocType.POWERPOINT); -const excelSplitter = factory.create(DocType.EXCEL); - -// 方式 2:从文件名自动推断 -const splitter = factory.createFromFilename('report.xlsx'); -const result = await splitter.split(task); - -// 处理结果 -console.log(`Generated ${result.totalPages} pages`); -result.pages.forEach(page => { - console.log(`Page ${page.page}: ${page.imagePath}`); -}); +// Excel: 按名称提取 Sheet(无需前缀) +const excelTask2: Task = { + id: 'task-004', + filename: 'data.xlsx', + pageRange: 'Sheet1,销售数据', // 按名称指定 +}; ``` --- -## 后续扩展 - -### 可选优化 - -1. **旧格式支持**:.doc、.ppt、.xls(非 XML 格式)需要额外库支持 -2. **图片提取**:从文档中提取嵌入图片 -3. **样式还原**:更精确的 CSS 样式映射 -4. **页码范围**:支持 page_range 参数 -5. **Excel 图表**:提取并渲染 Excel 图表 - -### 性能优化 - -1. **窗口复用**:多任务时复用 BrowserWindow -2. **并行渲染**:多页/多 Sheet 同时渲染 -3. **缓存机制**:相同文档的转换缓存 -4. 
**流式处理**:大文件分块读取 +## 对比方案变化 + +### 架构变化 + +| 方面 | 原方案 | 新方案 | +|------|--------|--------| +| Splitter 数量 | 1 个 OfficeSplitter | 3 个独立 Splitter | +| Word 解析 | officeparser(API 不匹配) | mammoth(成熟稳定) | +| PPT 解析 | officeparser(无 slide 结构) | JSZip + XML 解析 | +| Excel 解析 | officeparser(无 sheet 结构) | exceljs(完整 API) | +| CSV 解析 | 简单 split(有缺陷) | papaparse(RFC 4180) | +| 窗口池 | 单例模式 | 工厂模式(独立实例) | +| 内存优化 | 全页截图(可能 OOM) | 分段截图(4000px) | + +### 修复的问题 + +| 问题 | 原方案 | 新方案 | +|------|--------|--------| +| officeparser API 不匹配 | 假设存在 slides/sheets | 使用专门库 | +| 路径遍历漏洞 | 单次替换 `..` | 循环替换 + 严格验证 | +| CSV 复杂格式 | 简单 split | papaparse RFC 4180 | +| 大文档 OOM | 16000px 全页截图 | 4000px 分段截图 | +| 窗口池配置冲突 | 单例静默忽略配置 | 工厂模式独立实例 | +| 计时器泄漏 | 部分路径未清理 | 统一 cleanup 函数 | +| 窗口释放竞态 | 异步清理未等待 | await loadURL('about:blank') | +| Sheet 名称歧义 | 自动推断易出错 | # 前缀明确区分 | --- @@ -1143,5 +2195,11 @@ result.pages.forEach(page => { - Node.js: ≥ 18.0.0 - Electron: ≥ 28.0.0 -- mammoth: ≥ 1.6.0 -- xlsx: ≥ 0.18.5 +- mammoth: ≥ 1.8.0 +- jszip: ≥ 3.10.0 +- fast-xml-parser: ≥ 4.5.0 +- exceljs: ≥ 4.4.0 +- papaparse: ≥ 5.4.0 +- chardet: ≥ 2.0.0 +- iconv-lite: ≥ 0.6.3 +- sharp: ≥ 0.33.0 From acad80cf9dc13f69b01678edd91e47a662307e0d Mon Sep 17 00:00:00 2001 From: Jorben Date: Mon, 26 Jan 2026 15:06:59 +0800 Subject: [PATCH 3/3] =?UTF-8?q?docs:=20=F0=9F=93=9D=20simplify=20Excel=20S?= =?UTF-8?q?heet=20range=20to=20index-only=20format?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unify page range input format across all document types by removing name-based Sheet selection. Excel now uses the same numeric format as PDF, Word, and PowerPoint (e.g., "1-3,5" instead of "#1-2" or "Sheet1,数据表"). 
Changes: - Remove type and names fields from SheetRange interface - Simplify parseSheetRange to use parseNumeric directly - Update filterSheets to index-only filtering - Update documentation and examples Co-Authored-By: Claude --- docs/OFFICE_SPLITTER_DESIGN.md | 97 +++++++++------------------------- 1 file changed, 26 insertions(+), 71 deletions(-) diff --git a/docs/OFFICE_SPLITTER_DESIGN.md b/docs/OFFICE_SPLITTER_DESIGN.md index f95b443..fabebe9 100644 --- a/docs/OFFICE_SPLITTER_DESIGN.md +++ b/docs/OFFICE_SPLITTER_DESIGN.md @@ -173,7 +173,7 @@ export function isLegacyFormat(ext: string): boolean { │ 2. 获取 Sheet 列表和数据 │ │ 3. 每个 Sheet 构建 HTML 表格 │ │ 4. 分段截图 + 垂直分页 │ -│ 5. pageRange 基于 Sheet 索引/名称过滤 │ +│ 5. pageRange 基于 Sheet 索引过滤 │ └────────────────────────────────────────────────────────────────────────┘ │ @@ -267,12 +267,8 @@ export interface ParsedRange { } export interface SheetRange { - /** 解析类型 */ - type: 'indices' | 'names'; /** 按索引指定(1-based) */ - indices?: number[]; - /** 按名称指定 */ - names?: string[]; + indices: number[]; /** 原始范围字符串 */ raw: string; } @@ -280,12 +276,10 @@ export interface SheetRange { /** * 页面范围解析器 * - * 支持格式: - * - 单页: "3" + * 支持格式(所有文档类型统一): + * - 单页/单 Sheet: "3" * - 范围: "1-5" * - 混合: "1,3,5-10" - * - Sheet 索引: "#1-2" 或 "#1,3"(# 前缀明确表示索引) - * - Sheet 名称: "Sheet1,数据表"(无 # 前缀) */ export class PageRangeParser { /** @@ -349,56 +343,31 @@ export class PageRangeParser { /** * 解析 Excel Sheet 范围 * - * 支持格式: - * - 索引: "#1-2" 或 "#1,3"(# 前缀明确表示索引) - * - 名称: "Sheet1,数据表"(无 # 前缀,即使 Sheet 名是数字) + * 使用与页码相同的格式: + * - 单个: "1" + * - 范围: "1-3" + * - 混合: "1,3,5-7" * * @param range - 范围字符串 - * @param sheetNames - 可用的 Sheet 名称列表 + * @param sheetCount - Sheet 总数 */ static parseSheetRange( range: string | undefined, - sheetNames: string[] + sheetCount: number ): SheetRange { if (!range || range.trim() === '') { // 未指定,返回全部 Sheet return { - type: 'indices', - indices: Array.from({ length: sheetNames.length }, (_, i) => i + 1), + indices: Array.from({ length: 
sheetCount }, (_, i) => i + 1), raw: '', }; } - const trimmed = range.trim(); - - // 使用 # 前缀明确区分索引和名称 - if (trimmed.startsWith('#')) { - // 按索引解析(去掉 # 前缀) - const indexRange = trimmed.slice(1); - const parsed = this.parseNumeric(indexRange, sheetNames.length); - return { - type: 'indices', - indices: parsed.indices, - raw: range, - }; - } else { - // 按名称解析 - const names = trimmed.split(',').map(n => n.trim()).filter(Boolean); - const invalidNames = names.filter(n => !sheetNames.includes(n)); - - if (invalidNames.length > 0) { - throw new Error( - `Sheet not found: "${invalidNames.join('", "')}". ` + - `Available sheets: "${sheetNames.join('", "')}"` - ); - } - - return { - type: 'names', - names, - raw: range, - }; - } + const parsed = this.parseNumeric(range, sheetCount); + return { + indices: parsed.indices, + raw: range, + }; } /** @@ -408,15 +377,8 @@ export class PageRangeParser { sheetNames: string[], range: SheetRange ): string[] { - if (range.type === 'names' && range.names) { - // 按名称保持用户指定顺序 - return range.names; - } - if (range.type === 'indices' && range.indices) { - // 按索引过滤(1-based) - return range.indices.map(i => sheetNames[i - 1]); - } - return sheetNames; + // 按索引过滤(1-based) + return range.indices.map(i => sheetNames[i - 1]); } } ``` @@ -1626,9 +1588,9 @@ export class ExcelSplitter implements ISplitter { throw new Error('Excel file contains no data'); } - // 解析 Sheet 范围 + // 解析 Sheet 范围(使用与页码相同的格式) const sheetNames = sheets.map(s => s.name); - const sheetRange = PageRangeParser.parseSheetRange(task.pageRange, sheetNames); + const sheetRange = PageRangeParser.parseSheetRange(task.pageRange, sheets.length); const selectedSheets = PageRangeParser.filterSheets(sheetNames, sheetRange); const pages: PageInfo[] = []; @@ -2033,10 +1995,11 @@ export interface Task { /** * 页面范围(可选) * + * 所有文档类型使用统一格式: * - PDF: "1-3,5" 表示原生页码 * - Word: "1-3,5" 表示渲染后页码(注意:非原文档逻辑页码) * - PPT: "1,3,5-7" 表示幻灯片编号 - * - Excel: "#1-2" 表示 Sheet 索引,"Sheet1,数据表" 表示 Sheet 名称 + * - Excel: 
"1-3,5" 表示 Sheet 索引 */ pageRange?: string; } @@ -2125,7 +2088,7 @@ src/ | **PDF** | 原生页码 | `1-3,5` = 第 1-3 页和第 5 页 | | **Word** | **渲染后页码**(非原文档逻辑页码) | `1-3` = 渲染后的前 3 页截图 | | **PowerPoint** | 幻灯片编号 | `1,3,5-7` = 第 1、3、5-7 张幻灯片 | -| **Excel** | Sheet 索引(#前缀)或名称 | `#1-2` 或 `Sheet1,数据表` | +| **Excel** | Sheet 索引 | `1-3,5` = 第 1、2、3、5 个 Sheet | ### 使用示例 @@ -2145,21 +2108,15 @@ const pptTask: Task = { pageRange: '1,3,5-7', }; -// Excel: 按索引提取 Sheet(使用 # 前缀) +// Excel: 按索引提取 Sheet const excelTask1: Task = { id: 'task-003', filename: 'data.xlsx', - pageRange: '#1-2', // 第 1、2 个 Sheet + pageRange: '1-2', // 第 1、2 个 Sheet }; -// Excel: 按名称提取 Sheet(无需前缀) -const excelTask2: Task = { - id: 'task-004', - filename: 'data.xlsx', - pageRange: 'Sheet1,销售数据', // 按名称指定 -}; ``` --- ## 对比方案变化 @@ -2187,7 +2144,7 @@ const excelTask2: Task = { | 窗口池配置冲突 | 单例静默忽略配置 | 工厂模式独立实例 | | 计时器泄漏 | 部分路径未清理 | 统一 cleanup 函数 | | 窗口释放竞态 | 异步清理未等待 | await loadURL('about:blank') | -| Sheet 名称歧义 | 自动推断易出错 | # 前缀明确区分 | +| Sheet 范围格式 | 名称/索引混合易歧义 | 仅索引,与其他类型统一 | ---