From 531f7e3e0e4ac3d22cfe2dd62010e8b23cb9dc8b Mon Sep 17 00:00:00 2001 From: rongfeng Date: Sun, 8 Feb 2026 01:40:28 +0800 Subject: [PATCH 1/6] feat: add PaperReader v0.2 plugin - universal document recursive reader New plugin: Plugin/PaperReader - MinerU cloud API for high-fidelity PDF parsing (formulas/tables/images/OCR) - Auto-degradation to pdf-parse when MinerU unavailable - Section-aware chunking with Meta-Header injection - Rolling Context deep reading with auto-compression - Keyword-based query with section weighting - Supports academic papers, technical reports, books, legal documents, etc. --- Plugin/PaperReader/PaperReader.js | 174 +++++++++++++++ Plugin/PaperReader/README.md | 68 ++++++ Plugin/PaperReader/config.env.example | 25 +++ Plugin/PaperReader/lib/chunker.js | 183 ++++++++++++++++ Plugin/PaperReader/lib/deep-reader.js | 174 +++++++++++++++ Plugin/PaperReader/lib/ingest.js | 41 ++++ Plugin/PaperReader/lib/llm.js | 72 +++++++ Plugin/PaperReader/lib/mineru-client.js | 213 +++++++++++++++++++ Plugin/PaperReader/lib/pdf-parse-fallback.js | 35 +++ Plugin/PaperReader/lib/query.js | 108 ++++++++++ Plugin/PaperReader/lib/skeleton.js | 161 ++++++++++++++ Plugin/PaperReader/plugin-manifest.json | 75 +++++++ 12 files changed, 1329 insertions(+) create mode 100644 Plugin/PaperReader/PaperReader.js create mode 100644 Plugin/PaperReader/README.md create mode 100644 Plugin/PaperReader/config.env.example create mode 100644 Plugin/PaperReader/lib/chunker.js create mode 100644 Plugin/PaperReader/lib/deep-reader.js create mode 100644 Plugin/PaperReader/lib/ingest.js create mode 100644 Plugin/PaperReader/lib/llm.js create mode 100644 Plugin/PaperReader/lib/mineru-client.js create mode 100644 Plugin/PaperReader/lib/pdf-parse-fallback.js create mode 100644 Plugin/PaperReader/lib/query.js create mode 100644 Plugin/PaperReader/lib/skeleton.js create mode 100644 Plugin/PaperReader/plugin-manifest.json diff --git a/Plugin/PaperReader/PaperReader.js b/Plugin/PaperReader/PaperReader.js new file mode 100644 index 00000000..9b3b522d --- /dev/null +++ b/Plugin/PaperReader/PaperReader.js @@ -0,0 +1,174 @@ +/** + * PaperReader v0.2 — 主入口 + * + * stdin 接收 JSON → 路由到各 command handler → stdout 输出 JSON + */ + +const fs = require('fs').promises; +const fsSync = require('fs'); +const path = require('path'); +const crypto = require('crypto'); + +require('dotenv').config({ path: path.join(__dirname, 'config.env') }); +require('dotenv').config({ path: path.join(__dirname, '..', '..', 'config.env') }); + +const { ingestPdf } = require('./lib/ingest'); +const { chunkMarkdown } = require('./lib/chunker'); +const { generateSkeleton } = require('./lib/skeleton'); +const { readDeep } = require('./lib/deep-reader'); +const { queryPaper } = require('./lib/query'); + +const WORKSPACE_ROOT = path.join(__dirname, 'workspace'); + +function sendResponse(data) { + process.stdout.write(JSON.stringify(data)); + process.exit(0); +} + +function sha1(input) { + return crypto.createHash('sha1').update(input).digest('hex'); +} + +function getPaperWorkspace(paperId) { + return path.join(WORKSPACE_ROOT, paperId); +} + +async function writeJson(filePath, obj) { + await fs.writeFile(filePath, JSON.stringify(obj, null, 2), 'utf-8'); +} + +// ─── Command Handlers ─── + +async function handleIngestPDF({ filePath, paperId }) { + if (!filePath || typeof filePath !== 'string') { + throw new Error('IngestPDF requires filePath'); + } + + const abs = path.isAbsolute(filePath) ? 
filePath : path.resolve(process.cwd(), filePath); + if (!fsSync.existsSync(abs)) { + throw new Error(`PDF not found: ${abs}`); + } + + const resolvedPaperId = paperId && String(paperId).trim() + ? String(paperId).trim() + : `paper-${sha1(abs).slice(0, 10)}`; + + const wsDir = getPaperWorkspace(resolvedPaperId); + await fs.mkdir(wsDir, { recursive: true }); + + // L0: 解析 PDF → Markdown + Figures + const parsed = await ingestPdf(abs, { outputDir: wsDir }); + + // Save meta + const meta = { + paperId: resolvedPaperId, + sourceFilePath: abs, + extractedAt: new Date().toISOString(), + pageCount: parsed.pageCount, + textLength: (parsed.markdown || '').length, + engine: parsed.engine + }; + await writeJson(path.join(wsDir, 'meta.json'), meta); + + // Save full markdown + await fs.writeFile(path.join(wsDir, 'full_text.md'), parsed.markdown || '', 'utf-8'); + + // Save figure map + if (parsed.figureMap && parsed.figureMap.length > 0) { + await writeJson(path.join(wsDir, 'figure_map.json'), parsed.figureMap); + } + + // L1: 章节感知切分 + const chunks = chunkMarkdown(parsed.markdown || ''); + + // Save chunks + const chunksDir = path.join(wsDir, 'chunks'); + await fs.mkdir(chunksDir, { recursive: true }); + + for (const chunk of chunks) { + const chunkContent = chunk.metaHeader + ? `${chunk.metaHeader}\n\n---\n\n${chunk.text}` + : chunk.text; + await fs.writeFile( + path.join(chunksDir, `chunk_${chunk.index}.md`), + chunkContent, + 'utf-8' + ); + } + + // Save manifest + const manifest = { + chunkCount: chunks.length, + chunks: chunks.map(c => ({ + index: c.index, + section: c.section, + tokenCount: c.tokenCount + })) + }; + await writeJson(path.join(chunksDir, 'manifest.json'), manifest); + + // Create reading_notes dir + await fs.mkdir(path.join(wsDir, 'reading_notes'), { recursive: true }); + + return { + paperId: resolvedPaperId, + workspace: wsDir, + pageCount: meta.pageCount, + chunkCount: chunks.length, + engine: parsed.engine + }; +} + +async function handleReadSkeleton({ paperId, focus }) { + if (!paperId) throw new Error('ReadSkeleton requires paperId'); + const result = await generateSkeleton(paperId, { focus }); + return { paperId, globalMapPath: result.globalMapPath }; +} + +async function handleReadDeep({ paperId, goal }) { + if (!paperId) throw new Error('ReadDeep requires paperId'); + return await readDeep(paperId, { goal }); +} + +async function handleQuery({ paperId, question }) { + return await queryPaper(paperId, question); +} + +// ─── Main ─── + +async function main() { + let inputData = ''; + process.stdin.setEncoding('utf8'); + for await (const chunk of process.stdin) inputData += chunk; + + const request = JSON.parse(inputData || '{}'); + const command = request.command; + + try { + if (!command) throw new Error('Missing command'); + + let result; + switch (command) { + case 'IngestPDF': + result = await handleIngestPDF({ filePath: request.filePath, paperId: request.paperId }); + break; + case 'ReadSkeleton': + result = await handleReadSkeleton({ paperId: request.paperId, focus: request.focus }); + break; + case 'ReadDeep': + result = await handleReadDeep({ paperId: request.paperId, goal: request.goal }); + break; + case 'Query': + result = await handleQuery({ paperId: request.paperId, question: request.question }); + break; + default: + throw new Error(`Unknown command: ${command}`); + } + + sendResponse({ status: 'success', result }); + } catch (err) { + sendResponse({ status: 'error', error: err?.message || String(err) }); + } +} + +main(); diff --git 
a/Plugin/PaperReader/README.md b/Plugin/PaperReader/README.md
new file mode 100644
index 00000000..327290bd
--- /dev/null
+++ b/Plugin/PaperReader/README.md
@@ -0,0 +1,68 @@
+# PaperReader (v0.2)
+
+## Design Goals
+
+Turn very long PDFs / documents into a controllable, recursive reading workflow. Suitable for academic papers, technical reports, legal documents, book chapters, and other long-form documents.
+
+1. **L0 parsing layer**: high-fidelity parsing via the MinerU cloud API (preserves formulas/tables/images/multi-column layout), with automatic fallback to pdf-parse
+2. **L1 chunking layer**: section-aware chunking + Meta-Header injection + 10-20% overlap
+3. **L2 recursive logic layer**: Skeleton extraction / Rolling Context deep reading / merge & synthesis
+4. **L3 storage & interaction layer**: Obsidian-friendly Markdown directory structure
+
+## Commands
+
+| Command | Purpose |
+|------|------|
+| `IngestPDF` | PDF → Markdown → section-aware chunks |
+| `ReadSkeleton` | Generate a Global Map from the TOC/abstract/key sections |
+| `ReadDeep` | Recursive summarization with Rolling Context → Round-1 notes |
+| `Query` | Retrieval-style Q&A (keyword matching + section weighting) |
+
+## Artifact Layout
+
+```
+workspace/{paperId}/
+├── meta.json            # metadata (includes the parsing engine used)
+├── full_text.md         # full Markdown (L0 output)
+├── figure_map.json      # Figure_ID ↔ Caption mapping
+├── assets/
+│   └── figures/         # extracted images
+├── chunks/
+│   ├── manifest.json    # chunk manifest + section mapping
+│   └── chunk_{i}.md     # individual chunk (with Meta-Header)
+└── reading_notes/
+    ├── Global_Map.md          # skeleton map
+    ├── Chunk_Summaries.json   # per-chunk summaries
+    └── Round_1_Summary.md     # deep-reading notes
+```
+
+## Configuration
+
+Copy `config.env.example` to `config.env` and fill in:
+- `MINERU_API_TOKEN`: MinerU cloud API token (leave empty to fall back automatically)
+- `PaperReaderModel`: LLM model name
+- See `config.env.example` for the complete list of options
+
+## Dependencies
+
+- `axios`: HTTP requests
+- `pdf-parse`: PDF parsing in fallback mode
+- `adm-zip`: unpacking the zip archives returned by MinerU
+- `@dqbd/tiktoken`: token counting
+- `dotenv`: environment variables
+
+## Supported Document Types
+
+The MinerU cloud API can parse:
+- Academic papers (multi-column layouts, formulas, citations)
+- Technical reports / white papers
+- Book chapters
+- Legal documents / contracts
+- Scanned PDFs (built-in OCR)
+- Documents with complex tables
+
+## Known Limitations
+
+- MinerU free tier: 2000 pages per day; 200 MB / 600 pages per file
+- Rolling Context is capped at 4000 tokens and is compressed automatically beyond that
+- Query currently uses keyword matching (vector retrieval is planned for Phase 2)
diff --git a/Plugin/PaperReader/config.env.example b/Plugin/PaperReader/config.env.example
new file mode 100644
index 00000000..fbb51b8a
--- /dev/null
+++ b/Plugin/PaperReader/config.env.example
@@ -0,0 +1,25 @@
+# PaperReader 插件配置(示例)
+
+# === L0 解析层 ===
+# MinerU 云端 API Token(从 mineru.net 获取)
+MINERU_API_TOKEN=
+# 轮询超时(ms),默认 5 分钟
+MINERU_API_TIMEOUT=300000
+# 轮询间隔(ms),默认 5 秒
+MINERU_POLL_INTERVAL=5000
+
+# === L1 切分层 ===
+# 目标 chunk 大小(tokens)
+PaperReaderChunkSize=2000
+# chunk 重叠比例
+PaperReaderOverlap=0.15
+
+# === L2 递归逻辑层 ===
+# 读取/总结模型(使用 VCP 的 API_URL/API_Key 调用 /v1/chat/completions)
+PaperReaderModel=gemini-2.5-flash-search
+# 单次模型输出 token 上限
+PaperReaderMaxOutputTokens=12000
+# 分批并发组大小(每组处理的 chunk 数)
+PaperReaderBatchSize=4
+# deep 阅读最多处理多少个 chunk(防止成本失控)
+PaperReaderMaxChunks=120
diff --git a/Plugin/PaperReader/lib/chunker.js b/Plugin/PaperReader/lib/chunker.js
new file mode 100644
index 00000000..2b2f61c8
--- /dev/null
+++ b/Plugin/PaperReader/lib/chunker.js
@@ -0,0 +1,183 @@
+/**
+ * Section-aware chunker (T3)
+ *
+ * Splits on Markdown section headings (##); oversized sections are split again at paragraph boundaries.
+ * Each chunk gets an injected Meta-Header (section name + global-summary placeholder + overlap).
+ * Token counts use the tiktoken cl100k_base encoding.
+ */
+
+const { get_encoding } = require('@dqbd/tiktoken');
+
+const encoding = get_encoding('cl100k_base');
+
+const DEFAULT_TARGET_TOKENS = 2000;
+const DEFAULT_OVERLAP_RATIO = 0.15;
+const DEFAULT_MAX_CHUNKS = 120;
+
+/**
+ * Count the tokens in a piece of text
+ */
+function countTokens(text) {
+  if (!text) return 0;
+  return encoding.encode(text).length;
+}
+
+/**
+ * Extract the section structure from Markdown
+ * @returns {Array<{ level: number, title: string, content: string }>}
+ */
+function extractSections(markdown) {
+  const lines = markdown.split('\n');
+  const sections = [];
+  let currentSection = { level: 0, title: '(Preamble)', lines: [] };
+
+  for (const line of lines) {
+    const headerMatch = line.match(/^(#{1,4})\s+(.+)$/);
+    if (headerMatch) {
+      // Save previous section
+      if (currentSection.lines.length > 0 || currentSection.title !== '(Preamble)') {
+        sections.push({
+          level: currentSection.level,
+          title: currentSection.title,
+          content: currentSection.lines.join('\n')
+        });
+      }
+      currentSection = {
+        level: headerMatch[1].length,
+        title: headerMatch[2].trim(),
+        lines: [line]
+      };
+    } else {
+      currentSection.lines.push(line);
+    }
+  }
+
+  // Push last section
+  if (currentSection.lines.length > 0) {
+    sections.push({
+      level: currentSection.level,
+      title: currentSection.title,
+      content: currentSection.lines.join('\n')
+    });
+  }
+
+  return sections;
+}
+
+/**
+ * Split over-long text at paragraph boundaries
+ * @returns {string[]}
+ */
+function splitAtParagraphs(text, targetTokens) {
+  const paragraphs = text.split(/\n\n+/);
+  const pieces = [];
+  let current = '';
+  let currentTokens = 0;
+
+  for (const para of paragraphs) {
+    const paraTokens = countTokens(para);
+
+    if (currentTokens + paraTokens > targetTokens && current.trim()) {
+      pieces.push(current.trim());
+      current = '';
+      currentTokens = 0;
+    }
+
+    // Handle single paragraph exceeding limit
+    if (paraTokens > targetTokens && !current.trim()) {
+      const sentences = para.split(/(?<=[。?!.!?\n])/g);
+      for (const sent of sentences) {
+        const sentTokens = countTokens(sent);
+        if (currentTokens + sentTokens > targetTokens && current.trim()) {
+          pieces.push(current.trim());
+          current = '';
+          currentTokens = 0;
+        }
+        current += sent;
+        currentTokens += sentTokens;
+      }
+      continue;
+    }
+
+    current += (current ? '\n\n' : '') + para;
+    currentTokens += paraTokens;
+  }
+
+  if (current.trim()) {
+    pieces.push(current.trim());
+  }
+
+  return pieces;
+}
+
+/**
+ * Build the Meta-Header
+ */
+function makeMetaHeader(section, globalSummary, overlapText) {
+  const parts = [`[章节: ${section}]`];
+  if (globalSummary) {
+    parts.push(`[全局摘要: ${globalSummary}]`);
+  }
+  if (overlapText) {
+    parts.push(`[上文衔接: ...${overlapText.slice(-200)}]`);
+  }
+  return parts.join('\n');
+}
+
+/**
+ * Section-aware chunking
+ *
+ * @param {string} markdown - Markdown produced by the L0 layer
+ * @param {object} options - { targetTokens, overlapRatio, maxChunks, globalSummary }
+ * @returns {Array<{ index, section, tokenCount, text, metaHeader }>}
+ */
+function chunkMarkdown(markdown, options = {}) {
+  const targetTokens = options.targetTokens || DEFAULT_TARGET_TOKENS;
+  const overlapRatio = options.overlapRatio || DEFAULT_OVERLAP_RATIO;
+  const maxChunks = options.maxChunks || DEFAULT_MAX_CHUNKS;
+  const globalSummary = options.globalSummary || '';
+
+  if (!markdown || !markdown.trim()) return [];
+
+  const sections = extractSections(markdown);
+  const chunks = [];
+  let prevTail = '';
+
+  for (const section of sections) {
+    const sectionTokens = countTokens(section.content);
+
+    if (sectionTokens <= targetTokens) {
+      const metaHeader = makeMetaHeader(section.title, globalSummary, prevTail);
+      const text = section.content;
+      chunks.push({
+        index: chunks.length,
+        section: section.title,
+        tokenCount: countTokens(metaHeader + '\n\n' + text),
+        text,
+        metaHeader
+      });
+      const tailLen = Math.floor(text.length * overlapRatio);
+      prevTail = text.slice(-tailLen);
+    } else {
+      const pieces = splitAtParagraphs(section.content, targetTokens);
+      for (const piece of pieces) {
+        const metaHeader = makeMetaHeader(section.title, globalSummary, prevTail);
+        chunks.push({
+          index: chunks.length,
+          section: section.title,
+          tokenCount: countTokens(metaHeader + '\n\n' + piece),
+          text: piece,
+          metaHeader
+        });
+        const tailLen = Math.floor(piece.length * overlapRatio);
+        
prevTail = piece.slice(-tailLen); + } + } + + if (chunks.length >= maxChunks) break; + } + + return chunks.slice(0, maxChunks); +} + +module.exports = { chunkMarkdown, countTokens, extractSections }; diff --git a/Plugin/PaperReader/lib/deep-reader.js b/Plugin/PaperReader/lib/deep-reader.js new file mode 100644 index 00000000..05ab5890 --- /dev/null +++ b/Plugin/PaperReader/lib/deep-reader.js @@ -0,0 +1,174 @@ +/** + * Rolling Context Deep Reader (T6) + * + * 带滚动上下文的深度阅读:每个 chunk 摘要时携带前序累积的关键事实, + * 保持 chunk 间的连贯性。超出上限时自动压缩。 + */ + +const fs = require('fs').promises; +const fsSync = require('fs'); +const path = require('path'); +const { callLLM, callLLMJson } = require('./llm'); +const { countTokens } = require('./chunker'); + +const WORKSPACE_ROOT = path.join(__dirname, '..', 'workspace'); +const BATCH_SIZE = parseInt(process.env.PaperReaderBatchSize || '4', 10); +const MAX_CHUNKS = parseInt(process.env.PaperReaderMaxChunks || '120', 10); +const ROLLING_CONTEXT_MAX_TOKENS = 4000; + +/** + * 压缩 Rolling Context(当超过上限时) + */ +async function compressContext(rollingContext) { + const compressed = await callLLM([ + { role: 'system', content: '将以下累积的阅读笔记压缩为关键事实列表,保留最重要的信息、关键步骤和核心结论。删除冗余和过渡性描述。输出纯文本,不超过 2000 tokens。' }, + { role: 'user', content: rollingContext } + ], { max_tokens: 3000, temperature: 0.1 }); + return compressed; +} + +/** + * 对单个 chunk 做摘要(携带 Rolling Context) + */ +async function summarizeChunk(chunkText, { goal, globalMap, rollingContext, chunkIndex, section }) { + const system = [ + '你是一个"长文档分块摘要器",适用于各类文档(学术论文、技术报告、书籍、法律文书等)。', + '你会结合已有的阅读上下文,对当前 chunk 进行摘要。', + '输出 JSON(纯 JSON,不要代码块):', + '{"summary": string, "key_facts": string[], "methods": string[], "claims": string[], "open_questions": string[]}', + '其中 methods 字段可包含任何流程/步骤/操作方法(不限于科研实验),claims 包含文档中的核心论断/条款/规定。' + ].join('\n'); + + const userParts = [ + `主任务目标:${goal || '全面理解文档核心内容'}`, + `当前位置:第 ${chunkIndex} 块,章节「${section}」` + ]; + + if (rollingContext) { + userParts.push(`【已有阅读上下文】\n${rollingContext}`); + } + if (globalMap) { + userParts.push(`【全局地图摘要】\n${globalMap.slice(0, 2000)}`); + } + userParts.push(`【当前 chunk 内容】\n${chunkText}`); + + const result = await callLLMJson([ + { role: 'system', content: system }, + { role: 'user', content: userParts.join('\n\n') } + ], { temperature: 0.1 }); + + // Normalize result + return { + summary: result.summary || result.raw_response || '', + key_facts: result.key_facts || [], + methods: result.methods || [], + claims: result.claims || [], + open_questions: result.open_questions || [] + }; +} + +/** + * 带滚动上下文的深度阅读 + * + * @param {string} paperId + * @param {object} options - { goal, batchSize, maxChunks } + * @returns {Promise<{ summariesPath, roundPath }>} + */ +async function readDeep(paperId, options = {}) { + const wsDir = path.join(WORKSPACE_ROOT, paperId); + const chunksDir = path.join(wsDir, 'chunks'); + const manifestPath = path.join(chunksDir, 'manifest.json'); + + if (!fsSync.existsSync(manifestPath)) { + throw new Error(`chunks/manifest.json not found: ${manifestPath}`); + } + + const manifest = JSON.parse(await fs.readFile(manifestPath, 'utf-8')); + const chunks = manifest.chunks || []; + + // Load Global Map if exists + const globalMapPath = path.join(wsDir, 'reading_notes', 'Global_Map.md'); + const globalMap = fsSync.existsSync(globalMapPath) + ? 
await fs.readFile(globalMapPath, 'utf-8') + : ''; + + const batchSize = options.batchSize || BATCH_SIZE; + const maxChunks = Math.min(options.maxChunks || MAX_CHUNKS, chunks.length); + const goal = options.goal || ''; + + const limited = chunks.slice(0, maxChunks); + const summaries = []; + let rollingContext = ''; + + // Sequential processing with Rolling Context + for (let i = 0; i < limited.length; i += batchSize) { + const batch = limited.slice(i, i + batchSize); + + for (const chunk of batch) { + const chunkPath = path.join(chunksDir, `chunk_${chunk.index}.md`); + let chunkText; + if (fsSync.existsSync(chunkPath)) { + chunkText = await fs.readFile(chunkPath, 'utf-8'); + } else { + chunkText = chunk.text || ''; + } + + const summary = await summarizeChunk(chunkText, { + goal, + globalMap, + rollingContext, + chunkIndex: chunk.index, + section: chunk.section || 'unknown' + }); + + summaries.push({ + chunkIndex: chunk.index, + section: chunk.section, + ...summary + }); + + // Update Rolling Context + const newFacts = summary.key_facts.join('; '); + if (newFacts) { + rollingContext += `\n[Chunk ${chunk.index} - ${chunk.section}]: ${newFacts}`; + } + + // Compress if exceeding limit + if (countTokens(rollingContext) > ROLLING_CONTEXT_MAX_TOKENS) { + rollingContext = await compressContext(rollingContext); + } + } + } + + // Save chunk summaries + const notesDir = path.join(wsDir, 'reading_notes'); + await fs.mkdir(notesDir, { recursive: true }); + const summariesPath = path.join(notesDir, 'Chunk_Summaries.json'); + await fs.writeFile(summariesPath, JSON.stringify({ count: summaries.length, summaries }, null, 2), 'utf-8'); + + // Synthesis: merge all summaries into Round_1_Summary.md + const system = [ + '你是一个"长文档合并器",适用于各类文档。', + '输入是多段 chunk 的结构化摘要(含滚动上下文),请合并成一份结构化的深度笔记。', + '输出 Markdown,根据文档类型自适应包含:核心主题与结论、关键内容与论点、方法/流程/步骤(如有)、重要数据与证据、局限与风险、待解决问题清单。' + ].join('\n'); + + const user = [ + `主任务目标:${goal || '全面理解文档核心内容'}`, + globalMap ? 
`全局地图:\n${globalMap.slice(0, 3000)}` : '', + `最终累积上下文:\n${rollingContext}`, + `Chunk 摘要(${summaries.length} 个):\n${JSON.stringify(summaries).slice(0, 150000)}` + ].filter(Boolean).join('\n\n'); + + const merged = await callLLM([ + { role: 'system', content: system }, + { role: 'user', content: user } + ], { temperature: 0.2 }); + + const roundPath = path.join(notesDir, 'Round_1_Summary.md'); + await fs.writeFile(roundPath, merged || '', 'utf-8'); + + return { paperId, summariesPath, roundPath }; +} + +module.exports = { readDeep }; diff --git a/Plugin/PaperReader/lib/ingest.js b/Plugin/PaperReader/lib/ingest.js new file mode 100644 index 00000000..56374ad9 --- /dev/null +++ b/Plugin/PaperReader/lib/ingest.js @@ -0,0 +1,41 @@ +/** + * 统一解析入口 (T1+T2) + * + * 优先使用 MinerU 云端 API,失败则自动降级到 pdf-parse。 + */ + +const path = require('path'); +const fs = require('fs').promises; +const mineruClient = require('./mineru-client'); +const fallback = require('./pdf-parse-fallback'); + +/** + * 统一解析入口:优先 MinerU,失败则降级 + * + * @param {string} pdfPath - PDF 绝对路径 + * @param {object} options - { outputDir, token, timeout, pollInterval } + * @returns {Promise<{ markdown, figures, pageCount, figureMap, engine: 'mineru'|'pdf-parse' }>} + */ +async function ingestPdf(pdfPath, options = {}) { + const outputDir = options.outputDir || path.dirname(pdfPath); + const hasMineruToken = !!(options.token || process.env.MINERU_API_TOKEN); + + if (hasMineruToken) { + try { + const result = await mineruClient.parsePdf(pdfPath, { ...options, outputDir }); + return { ...result, engine: 'mineru' }; + } catch (err) { + // Log degradation warning, then fall through to pdf-parse + const errMsg = err instanceof mineruClient.MineruError + ? `[MinerU ${err.code}] ${err.message}` + : `[MinerU Error] ${err.message}`; + process.stderr.write(`[PaperReader] MinerU failed, degrading to pdf-parse: ${errMsg}\n`); + } + } + + // Fallback to pdf-parse + const result = await fallback.parsePdf(pdfPath); + return { ...result, engine: 'pdf-parse' }; +} + +module.exports = { ingestPdf }; diff --git a/Plugin/PaperReader/lib/llm.js b/Plugin/PaperReader/lib/llm.js new file mode 100644 index 00000000..abe09376 --- /dev/null +++ b/Plugin/PaperReader/lib/llm.js @@ -0,0 +1,72 @@ +/** + * LLM 调用封装 (T4) + * + * 从 PaperReader.js 抽出,统一管理模型调用。 + */ + +const axios = require('axios'); +const path = require('path'); + +require('dotenv').config({ path: path.join(__dirname, '..', 'config.env') }); +require('dotenv').config({ path: path.join(__dirname, '..', '..', '..', 'config.env') }); + +const API_KEY = process.env.API_Key; +const API_URL = process.env.API_URL; +const MODEL = process.env.PaperReaderModel; +const MAX_OUTPUT_TOKENS = parseInt(process.env.PaperReaderMaxOutputTokens || '12000', 10); + +function ensureConfig() { + if (!API_KEY || !API_URL) { + throw new Error('Missing API config: API_Key/API_URL are required (from repo root config.env).'); + } + if (!MODEL) { + throw new Error('Missing PaperReaderModel in config.env'); + } +} + +/** + * 调用 LLM (OpenAI-compatible API) + * + * @param {Array<{role: string, content: string}>} messages + * @param {object} options - { max_tokens, temperature } + * @returns {Promise} 模型输出文本 + */ +async function callLLM(messages, { max_tokens = MAX_OUTPUT_TOKENS, temperature = 0.2 } = {}) { + ensureConfig(); + + const payload = { + model: MODEL, + messages, + stream: false, + max_tokens, + temperature + }; + + const resp = await axios.post(API_URL, payload, { + headers: { Authorization: `Bearer ${API_KEY}`, 'Content-Type': 
'application/json' }, + timeout: 180000 + }); + + return resp?.data?.choices?.[0]?.message?.content || ''; +} + +/** + * 调用 LLM 并解析 JSON 响应 + * + * @param {Array} messages + * @param {object} options + * @returns {Promise} 解析后的 JSON 对象 + */ +async function callLLMJson(messages, options = {}) { + const raw = await callLLM(messages, { ...options, temperature: options.temperature ?? 0.1 }); + try { + // 尝试从 markdown 代码块中提取 JSON + const jsonMatch = raw.match(/```(?:json)?\s*([\s\S]*?)```/); + const jsonStr = jsonMatch ? jsonMatch[1].trim() : raw.trim(); + return JSON.parse(jsonStr); + } catch { + return { raw_response: raw }; + } +} + +module.exports = { callLLM, callLLMJson }; diff --git a/Plugin/PaperReader/lib/mineru-client.js b/Plugin/PaperReader/lib/mineru-client.js new file mode 100644 index 00000000..f0ceb760 --- /dev/null +++ b/Plugin/PaperReader/lib/mineru-client.js @@ -0,0 +1,213 @@ +/** + * MinerU Cloud API 适配器 (T1) + * + * 流程: 获取上传URL → PUT上传PDF → 轮询batch结果 → 下载zip → 提取md+figures + */ + +const fs = require('fs').promises; +const path = require('path'); +const axios = require('axios'); + +const MINERU_API_BASE = 'https://mineru.net/api/v4'; + +class MineruError extends Error { + constructor(code, message) { + super(message); + this.name = 'MineruError'; + this.code = code; + } +} + +/** + * 获取预签名上传URL + */ +async function getUploadUrl(token, fileName) { + const resp = await axios.post(`${MINERU_API_BASE}/file-urls/batch`, { + files: [{ name: fileName, data_id: `pr_${Date.now()}` }], + model_version: 'hybrid-auto-engine' + }, { + headers: { + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${token}` + }, + timeout: 30000 + }); + + const data = resp.data; + if (data.code !== 0) { + throw new MineruError('MINERU_AUTH_FAILED', `MinerU API error: code=${data.code}, msg=${data.msg || ''}`); + } + + return { + uploadUrl: data.data.file_urls[0], + batchId: data.data.batch_id + }; +} + +/** + * PUT 上传文件到预签名URL + */ +async function uploadFile(uploadUrl, filePath) { + const fileBuffer = await fs.readFile(filePath); + await axios.put(uploadUrl, fileBuffer, { + headers: { 'Content-Type': 'application/octet-stream' }, + timeout: 120000, + maxContentLength: 200 * 1024 * 1024 + }); +} + +/** + * 轮询batch结果 + */ +async function pollBatchResult(token, batchId, { timeout = 300000, pollInterval = 5000 } = {}) { + const startTime = Date.now(); + const url = `${MINERU_API_BASE}/extract-results/batch/${batchId}`; + + while (Date.now() - startTime < timeout) { + const resp = await axios.get(url, { + headers: { 'Authorization': `Bearer ${token}` }, + timeout: 15000 + }); + + const data = resp.data; + if (data.code !== 0) { + throw new MineruError('MINERU_PARSE_FAILED', `Batch poll failed: code=${data.code}`); + } + + const results = data.data?.extract_result || []; + if (results.length > 0) { + const first = results[0]; + if (first.state === 'done') { + return first; + } + if (first.state === 'failed') { + throw new MineruError('MINERU_PARSE_FAILED', `Batch task failed: ${first.err_msg || 'unknown'}`); + } + } + + await new Promise(r => setTimeout(r, pollInterval)); + } + + throw new MineruError('MINERU_TIMEOUT', `Batch polling timeout after ${timeout}ms`); +} + +/** + * 下载并解压结果zip,提取markdown和图片 + */ +async function downloadAndExtract(zipUrl, outputDir) { + const AdmZip = require('adm-zip'); + + const resp = await axios.get(zipUrl, { + responseType: 'arraybuffer', + timeout: 120000 + }); + + const zip = new AdmZip(resp.data); + const entries = zip.getEntries(); + + let markdown = ''; + const 
figures = []; + + const figuresDir = path.join(outputDir, 'assets', 'figures'); + await fs.mkdir(figuresDir, { recursive: true }); + + for (const entry of entries) { + const entryName = entry.entryName; + + if (entryName.endsWith('.md') && !entry.isDirectory) { + markdown = entry.getData().toString('utf-8'); + } else if (/\.(png|jpg|jpeg|gif|svg|webp)$/i.test(entryName) && !entry.isDirectory) { + const figName = path.basename(entryName); + const figPath = path.join(figuresDir, figName); + await fs.writeFile(figPath, entry.getData()); + figures.push({ + id: figName.replace(/\.[^.]+$/, ''), + path: `assets/figures/${figName}`, + filename: figName + }); + } + } + + return { markdown, figures }; +} + +/** + * 从markdown中提取figure caption映射 + */ +function extractFigureCaptions(markdown) { + const captions = []; + // 匹配 ![caption](path) 模式 + const imgRegex = /!\[([^\]]*)\]\(([^)]+)\)/g; + let match; + while ((match = imgRegex.exec(markdown)) !== null) { + captions.push({ + caption: match[1], + originalPath: match[2], + id: path.basename(match[2]).replace(/\.[^.]+$/, '') + }); + } + + // 匹配 "Figure X." 或 "Fig. X:" 开头的段落 + const figTextRegex = /^(Fig(?:ure)?\.?\s*\d+[.:]\s*)(.+)$/gm; + while ((match = figTextRegex.exec(markdown)) !== null) { + captions.push({ + caption: match[2].trim(), + label: match[1].trim(), + id: `fig_text_${captions.length}` + }); + } + + return captions; +} + +/** + * 完整流程:上传 PDF → 提交解析 → 轮询 → 返回结果 + * + * @param {string} pdfPath - PDF 绝对路径 + * @param {object} options - { token, timeout, pollInterval, outputDir } + * @returns {Promise<{ markdown: string, figures: Array, pageCount: number, figureMap: Array }>} + */ +async function parsePdf(pdfPath, options = {}) { + const token = options.token || process.env.MINERU_API_TOKEN; + if (!token) { + throw new MineruError('MINERU_AUTH_FAILED', 'MINERU_API_TOKEN is required'); + } + + const timeout = options.timeout || parseInt(process.env.MINERU_API_TIMEOUT || '300000', 10); + const pollInterval = options.pollInterval || parseInt(process.env.MINERU_POLL_INTERVAL || '5000', 10); + + const fileName = path.basename(pdfPath); + const outputDir = options.outputDir || path.dirname(pdfPath); + + // Step 1: 获取上传URL + const { uploadUrl, batchId } = await getUploadUrl(token, fileName); + + // Step 2: 上传文件 + await uploadFile(uploadUrl, pdfPath); + + // Step 3: 轮询batch结果 (file-urls/batch 自动创建解析任务) + const batchResult = await pollBatchResult(token, batchId, { timeout, pollInterval }); + + // Step 4: 下载并解压结果 + const zipUrl = batchResult.full_zip_url; + if (!zipUrl) { + throw new MineruError('MINERU_PARSE_FAILED', 'No zip URL in result'); + } + + const { markdown, figures } = await downloadAndExtract(zipUrl, outputDir); + + // Step 5: 提取figure captions + const figureMap = extractFigureCaptions(markdown); + + return { + markdown, + figures, + pageCount: batchResult.page_count || null, + figureMap + }; +} + +module.exports = { + parsePdf, + MineruError +}; diff --git a/Plugin/PaperReader/lib/pdf-parse-fallback.js b/Plugin/PaperReader/lib/pdf-parse-fallback.js new file mode 100644 index 00000000..3edb96ab --- /dev/null +++ b/Plugin/PaperReader/lib/pdf-parse-fallback.js @@ -0,0 +1,35 @@ +/** + * pdf-parse 降级回退封装 (T2) + * + * 当 MinerU API 不可用时,回退到本地 pdf-parse 纯文本抽取。 + * 输出格式与 mineru-client.js 对齐,但 figures 为空,markdown 为纯文本。 + */ + +const fs = require('fs').promises; +const pdfParse = require('pdf-parse'); + +/** + * 使用 pdf-parse 做纯文本抽取(降级模式) + * + * @param {string} pdfPath - PDF 绝对路径 + * @returns {Promise<{ markdown: string, figures: [], pageCount: 
number, figureMap: [], degraded: true }>} + */ +async function parsePdf(pdfPath) { + const buffer = await fs.readFile(pdfPath); + const parsed = await pdfParse(buffer); + + const rawText = parsed.text || ''; + const markdown = rawText + .replace(/\r\n/g, '\n') + .replace(/\n{3,}/g, '\n\n'); + + return { + markdown, + figures: [], + pageCount: parsed.numpages || null, + figureMap: [], + degraded: true + }; +} + +module.exports = { parsePdf }; diff --git a/Plugin/PaperReader/lib/query.js b/Plugin/PaperReader/lib/query.js new file mode 100644 index 00000000..16b21762 --- /dev/null +++ b/Plugin/PaperReader/lib/query.js @@ -0,0 +1,108 @@ +/** + * Query 问答模块 (T7) + * + * Phase 1: 关键词匹配挑选相关 chunk + LLM 问答 + * Phase 2: 升级为向量检索 + */ + +const fs = require('fs').promises; +const fsSync = require('fs'); +const path = require('path'); +const { callLLM } = require('./llm'); + +const WORKSPACE_ROOT = path.join(__dirname, '..', 'workspace'); + +/** + * 关键词匹配挑选相关 chunk + */ +function keywordPick(chunks, question, topK = 6) { + const q = String(question || '').toLowerCase().trim(); + if (!q) return chunks.slice(0, topK); + + const words = q.split(/[\s,;,;。?!?!]+/).filter(w => w.length >= 2).slice(0, 15); + + const scored = chunks.map(c => { + const text = (c.text || '').toLowerCase(); + const section = (c.section || '').toLowerCase(); + let score = 0; + for (const w of words) { + if (text.includes(w)) score += 1; + if (section.includes(w)) score += 2; + } + return { chunk: c, score }; + }); + + scored.sort((a, b) => b.score - a.score); + return scored.slice(0, topK).filter(s => s.score > 0).map(s => s.chunk); +} + +/** + * 对已导入的文档做检索式问答 + * + * @param {string} paperId + * @param {string} question + * @returns {Promise<{ paperId, answer, sources: Array }>} + */ +async function queryPaper(paperId, question) { + if (!paperId) throw new Error('Query requires paperId'); + if (!question) throw new Error('Query requires question'); + + const wsDir = path.join(WORKSPACE_ROOT, paperId); + const manifestPath = path.join(wsDir, 'chunks', 'manifest.json'); + + if (!fsSync.existsSync(manifestPath)) { + throw new Error(`chunks/manifest.json not found: ${manifestPath}`); + } + + const manifest = JSON.parse(await fs.readFile(manifestPath, 'utf-8')); + const chunks = manifest.chunks || []; + + // Load Global Map if exists + const globalMapPath = path.join(wsDir, 'reading_notes', 'Global_Map.md'); + const globalMap = fsSync.existsSync(globalMapPath) + ? await fs.readFile(globalMapPath, 'utf-8') + : ''; + + // Pick relevant chunks + const picked = keywordPick(chunks, question, 6); + const contextChunks = picked.length > 0 ? picked : chunks.slice(0, 4); + + // Read chunk files for full content + const contextParts = []; + for (const c of contextChunks) { + const chunkPath = path.join(wsDir, 'chunks', `chunk_${c.index}.md`); + let text; + if (fsSync.existsSync(chunkPath)) { + text = await fs.readFile(chunkPath, 'utf-8'); + } else { + text = c.text || ''; + } + contextParts.push(`---\n[chunk ${c.index} | 章节: ${c.section || 'unknown'}]\n${text}`); + } + const context = contextParts.join('\n\n'); + + const system = [ + '你是一个"文档问答助手",适用于各类长文档(学术论文、技术报告、书籍、法律文书等)。', + '只根据提供的上下文回答;若上下文不足,明确说"证据不足",并给出下一步需要检索的章节/关键词。', + '输出:先给结论,再给证据引用(标注 chunk index 和章节名)。' + ].join('\n'); + + const user = [ + globalMap ? 
`全局地图:\n${globalMap.slice(0, 2000)}` : '', + `问题:${question}`, + `上下文:\n${context}` + ].filter(Boolean).join('\n\n'); + + const answer = await callLLM([ + { role: 'system', content: system }, + { role: 'user', content: user } + ], { temperature: 0.2 }); + + return { + paperId, + answer, + sources: contextChunks.map(c => ({ index: c.index, section: c.section })) + }; +} + +module.exports = { queryPaper }; diff --git a/Plugin/PaperReader/lib/skeleton.js b/Plugin/PaperReader/lib/skeleton.js new file mode 100644 index 00000000..dd200cf3 --- /dev/null +++ b/Plugin/PaperReader/lib/skeleton.js @@ -0,0 +1,161 @@ +/** + * Skeleton 骨架提取重构 (T5) + * + * 从 Markdown 结构提取目录树、Abstract、Conclusion、Figure Caption, + * 生成 Global Map。不再只读首尾2块。 + */ + +const fs = require('fs').promises; +const fsSync = require('fs'); +const path = require('path'); +const { callLLM } = require('./llm'); +const { extractSections } = require('./chunker'); + +const WORKSPACE_ROOT = path.join(__dirname, '..', 'workspace'); + +/** + * 从 Markdown 提取目录树(标题列表) + */ +function extractTOC(markdown) { + const lines = markdown.split('\n'); + const toc = []; + for (const line of lines) { + const match = line.match(/^(#{1,4})\s+(.+)$/); + if (match) { + toc.push({ + level: match[1].length, + title: match[2].trim(), + indent: ' '.repeat(match[1].length - 1) + }); + } + } + return toc; +} + +/** + * 提取关键章节全文 + */ +function extractKeySections(sections) { + const keyPatterns = [ + /abstract/i, + /introduction/i, + /conclusion/i, + /discussion/i, + /summary/i, + /overview/i, + /background/i, + /preface/i, + /executive.?summary/i, + /摘要/, + /引言/, + /结论/, + /讨论/, + /概述/, + /背景/, + /前言/, + /总结/ + ]; + + const found = []; + for (const section of sections) { + for (const pattern of keyPatterns) { + if (pattern.test(section.title)) { + found.push(section); + break; + } + } + } + return found; +} + +/** + * 从 figure_map.json 加载 Figure Captions + */ +async function loadFigureCaptions(wsDir) { + const figMapPath = path.join(wsDir, 'figure_map.json'); + if (!fsSync.existsSync(figMapPath)) return []; + const raw = await fs.readFile(figMapPath, 'utf-8'); + try { + return JSON.parse(raw); + } catch { + return []; + } +} + +/** + * 从 Markdown 结构提取骨架并生成 Global Map + * + * @param {string} paperId + * @param {object} options - { focus } + * @returns {Promise<{ globalMapPath: string, globalMapContent: string }>} + */ +async function generateSkeleton(paperId, options = {}) { + const wsDir = path.join(WORKSPACE_ROOT, paperId); + const mdPath = path.join(wsDir, 'full_text.md'); + const metaPath = path.join(wsDir, 'meta.json'); + + if (!fsSync.existsSync(mdPath)) { + throw new Error(`full_text.md not found: ${mdPath}`); + } + + const markdown = await fs.readFile(mdPath, 'utf-8'); + const meta = fsSync.existsSync(metaPath) + ? JSON.parse(await fs.readFile(metaPath, 'utf-8')) + : {}; + + // 1. 提取目录树 + const toc = extractTOC(markdown); + const tocText = toc.map(t => `${t.indent}- ${t.title}`).join('\n'); + + // 2. 提取关键章节 + const sections = extractSections(markdown); + const keySections = extractKeySections(sections); + const keyText = keySections + .map(s => `### ${s.title}\n${s.content.slice(0, 3000)}`) + .join('\n\n'); + + // 3. 加载 Figure Captions + const figureCaptions = await loadFigureCaptions(wsDir); + const captionText = figureCaptions.length > 0 + ? figureCaptions.map(f => `- ${f.label || f.id}: ${f.caption}`).join('\n') + : '(无图注信息)'; + + // 4. 
构建 LLM prompt + const system = [ + '你是一个"文档骨架提取器",适用于各类长文档(学术论文、技术报告、书籍章节、法律文书等)。', + '目标:基于目录结构、关键章节和图注,提取文档的全局地图(Global Map)。', + '输出 Markdown,根据文档类型自适应包含以下要素:', + '1. 核心主题(1-2句话概括本文档的核心内容)', + '2. 核心问题/目的(本文档要解决什么问题或传达什么信息)', + '3. 关键内容概要(主要论点、方法、流程、条款等——依文档类型而定)', + '4. 结构路线图(文档的组织逻辑和各部分之间的关系)', + '5. 主要结论/要点', + '6. 局限性/注意事项/风险点', + '7. 各章节阅读优先级标签(High/Medium/Low)', + '8. 后续深读建议(重点关注哪些章节/图表/附录)', + '引用原文短句时标注来自哪个章节。' + ].join('\n'); + + const user = [ + `阅读焦点:${options.focus || '通用理解(全面掌握文档核心内容与结构)'}`, + `元信息:页数=${meta.pageCount ?? 'unknown'}`, + `\n【目录结构】\n${tocText}`, + `\n【关键章节内容】\n${keyText.slice(0, 15000)}`, + `\n【图注列表】\n${captionText}` + ].join('\n\n'); + + const content = await callLLM([ + { role: 'system', content: system }, + { role: 'user', content: user } + ]); + + // 5. 写入 Global_Map.md + const notesDir = path.join(wsDir, 'reading_notes'); + await fs.mkdir(notesDir, { recursive: true }); + const outPath = path.join(notesDir, 'Global_Map.md'); + await fs.writeFile(outPath, content || '', 'utf-8'); + + return { globalMapPath: outPath, globalMapContent: content }; +} + +module.exports = { generateSkeleton, extractTOC, extractKeySections }; diff --git a/Plugin/PaperReader/plugin-manifest.json b/Plugin/PaperReader/plugin-manifest.json new file mode 100644 index 00000000..580fa8e6 --- /dev/null +++ b/Plugin/PaperReader/plugin-manifest.json @@ -0,0 +1,75 @@ +{ + "manifestVersion": "1.0.0", + "name": "PaperReader", + "version": "0.2.0", + "displayName": "超文本递归阅读器", + "description": "将超长 PDF/文档转为可递归阅读的分块工件(Skeleton/Deep/Query)。适用于学术论文、技术报告、书籍章节、法律文书等各类长文档。v0.2: MinerU 云端高保真解析(公式/表格/图片/多栏/扫描OCR)+ 章节感知切分 + Rolling Context 深度阅读。MinerU 不可用时自动降级到 pdf-parse。", + "author": "VCP", + "pluginType": "synchronous", + "entryPoint": { + "type": "nodejs", + "command": "node PaperReader.js" + }, + "communication": { + "protocol": "stdio", + "timeout": 600000 + }, + "configSchema": { + "MINERU_API_TOKEN": { + "type": "string", + "description": "MinerU 云端 API Token(从 mineru.net 获取)。不填则自动降级到 pdf-parse。" + }, + "MINERU_API_TIMEOUT": { + "type": "integer", + "description": "MinerU 轮询超时(ms),默认 300000(5分钟)。" + }, + "MINERU_POLL_INTERVAL": { + "type": "integer", + "description": "MinerU 轮询间隔(ms),默认 5000。" + }, + "PaperReaderChunkSize": { + "type": "integer", + "description": "目标 chunk 大小(tokens),默认 2000。" + }, + "PaperReaderOverlap": { + "type": "number", + "description": "chunk 重叠比例,默认 0.15。" + }, + "PaperReaderModel": { + "type": "string", + "description": "用于阅读/总结的模型名称。" + }, + "PaperReaderMaxOutputTokens": { + "type": "integer", + "description": "单次模型输出上限,默认 12000。" + }, + "PaperReaderBatchSize": { + "type": "integer", + "description": "Deep 阅读分批处理的 chunk 数,默认 4。" + }, + "PaperReaderMaxChunks": { + "type": "integer", + "description": "单次 deep 阅读最多处理的 chunk 数(防止成本失控),默认 120。" + } + }, + "capabilities": { + "invocationCommands": [ + { + "commandIdentifier": "IngestPDF", + "description": "解析 PDF 并生成可递归阅读的分块工件。支持学术论文、技术报告、书籍、法律文书等各类 PDF。使用 MinerU 云端 API 高保真解析(保留公式/表格/图片/多栏排版),失败自动降级到 pdf-parse。输出章节感知的 chunk 文件。\n参数:\n- command: 固定为 IngestPDF\n- filePath (字符串, 必需): PDF 绝对路径\n- paperId (字符串, 可选): 自定义 ID(不传则自动生成)\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」IngestPDF「末」,\nfilePath:「始」D:\\\\books\\\\paper.pdf「末」\n<<<[END_TOOL_REQUEST]>>>" + }, + { + "commandIdentifier": "ReadSkeleton", + "description": "基于已导入的文档工件生成骨架地图(Global Map)。从 Markdown 结构提取目录树、关键章节和图注,生成带阅读优先级标签的全局地图。适用于任何已 Ingest 的文档。\n参数:\n- command: 固定为 ReadSkeleton\n- paperId (字符串, 必需)\n- focus (字符串, 可选): 
本次阅读关注点\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」ReadSkeleton「末」,\npaperId:「始」paper-xxx「末」,\nfocus:「始」方法学与实验设计「末」\n<<<[END_TOOL_REQUEST]>>>" + }, + { + "commandIdentifier": "ReadDeep", + "description": "带 Rolling Context 的深度阅读:逐 chunk 摘要时携带前序累积的关键事实,保持 chunk 间连贯性。超出上限自动压缩。最终合并为 Round-1 深度笔记。\n参数:\n- command: 固定为 ReadDeep\n- paperId (字符串, 必需)\n- goal (字符串, 可选): 主任务目标(用于决定摘要粒度)\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」ReadDeep「末」,\npaperId:「始」paper-xxx「末」,\ngoal:「始」快速理解核心贡献与可复现实验步骤「末」\n<<<[END_TOOL_REQUEST]>>>" + }, + { + "commandIdentifier": "Query", + "description": "对已导入的文档做检索式问答(关键词匹配 + 章节权重挑选相关 chunk)。返回答案及引用来源。\n参数:\n- command: 固定为 Query\n- paperId (字符串, 必需)\n- question (字符串, 必需)\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」Query「末」,\npaperId:「始」paper-xxx「末」,\nquestion:「始」这份文档的核心结论是什么?「末」\n<<<[END_TOOL_REQUEST]>>>" + } + ] + } +} From 9fdeda2ee2823fbe836403a73a45088ee2ad44c9 Mon Sep 17 00:00:00 2001 From: rongfeng Date: Sun, 8 Feb 2026 02:51:04 +0800 Subject: [PATCH 2/6] fix: correct MinerU API model_version and upload headers per official docs - model_version: use 'pipeline' (default) or 'vlm', not 'hybrid-auto-engine' - Remove Content-Type header from PUT upload (per MinerU docs) - Add MINERU_MODEL_VERSION config option - Clarify config.env.example: Bearer Token, not Access Key/Secret Key --- Plugin/PaperReader/config.env.example | 6 +++++- Plugin/PaperReader/lib/mineru-client.js | 13 ++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/Plugin/PaperReader/config.env.example b/Plugin/PaperReader/config.env.example index fbb51b8a..92ffb42f 100644 --- a/Plugin/PaperReader/config.env.example +++ b/Plugin/PaperReader/config.env.example @@ -1,8 +1,12 @@ # PaperReader 插件配置(示例) # === L0 解析层 === -# MinerU 云端 API Token(从 mineru.net 获取) +# MinerU 云端 API Token(从 mineru.net 网站的「个人中心 → API密钥管理」获取) +# 注意:这里填的是 Bearer Token(一串长字符串),不是 Access Key / Secret Key +# 不填则自动降级到 pdf-parse 纯文本模式 MINERU_API_TOKEN= +# MinerU 模型版本:pipeline(默认,速度快)或 vlm(效果更好,速度较慢) +MINERU_MODEL_VERSION=pipeline # 轮询超时(ms),默认 5 分钟 MINERU_API_TIMEOUT=300000 # 轮询间隔(ms),默认 5 秒 diff --git a/Plugin/PaperReader/lib/mineru-client.js b/Plugin/PaperReader/lib/mineru-client.js index f0ceb760..f197a44b 100644 --- a/Plugin/PaperReader/lib/mineru-client.js +++ b/Plugin/PaperReader/lib/mineru-client.js @@ -21,10 +21,12 @@ class MineruError extends Error { /** * 获取预签名上传URL */ -async function getUploadUrl(token, fileName) { +async function getUploadUrl(token, fileName, modelVersion) { const resp = await axios.post(`${MINERU_API_BASE}/file-urls/batch`, { files: [{ name: fileName, data_id: `pr_${Date.now()}` }], - model_version: 'hybrid-auto-engine' + enable_formula: true, + enable_table: true, + model_version: modelVersion }, { headers: { 'Content-Type': 'application/json', @@ -49,8 +51,8 @@ async function getUploadUrl(token, fileName) { */ async function uploadFile(uploadUrl, filePath) { const fileBuffer = await fs.readFile(filePath); + // MinerU 文档明确说明:上传文件时无须设置 Content-Type 请求头 await axios.put(uploadUrl, fileBuffer, { - headers: { 'Content-Type': 'application/octet-stream' }, timeout: 120000, maxContentLength: 200 * 1024 * 1024 }); @@ -164,7 +166,7 @@ function extractFigureCaptions(markdown) { * 完整流程:上传 PDF → 提交解析 → 轮询 → 返回结果 * * @param {string} pdfPath - PDF 绝对路径 - * @param {object} options - { token, timeout, pollInterval, outputDir } + * @param {object} options - { token, timeout, pollInterval, outputDir, modelVersion } * @returns 
{Promise<{ markdown: string, figures: Array, pageCount: number, figureMap: Array }>} */ async function parsePdf(pdfPath, options = {}) { @@ -175,12 +177,13 @@ async function parsePdf(pdfPath, options = {}) { const timeout = options.timeout || parseInt(process.env.MINERU_API_TIMEOUT || '300000', 10); const pollInterval = options.pollInterval || parseInt(process.env.MINERU_POLL_INTERVAL || '5000', 10); + const modelVersion = options.modelVersion || process.env.MINERU_MODEL_VERSION || 'pipeline'; const fileName = path.basename(pdfPath); const outputDir = options.outputDir || path.dirname(pdfPath); // Step 1: 获取上传URL - const { uploadUrl, batchId } = await getUploadUrl(token, fileName); + const { uploadUrl, batchId } = await getUploadUrl(token, fileName, modelVersion); // Step 2: 上传文件 await uploadFile(uploadUrl, pdfPath); From 499bd1a3ade7c50ed9a8af91ad0c8293b6e0dc18 Mon Sep 17 00:00:00 2001 From: rongfeng Date: Sun, 8 Feb 2026 03:53:26 +0800 Subject: [PATCH 3/6] fix: MinerU OSS upload 403 + pdf-parse v2 API compatibility - mineru-client.js: replace axios.put with native https.request for OSS presigned URL upload (axios auto-adds headers that break signature) - pdf-parse-fallback.js: rewrite for pdf-parse v2 API (PDFParse class + Uint8Array + getText/destroy) --- Plugin/PaperReader/lib/mineru-client.js | 35 ++++++++++++++++++-- Plugin/PaperReader/lib/pdf-parse-fallback.js | 28 ++++++++++++---- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/Plugin/PaperReader/lib/mineru-client.js b/Plugin/PaperReader/lib/mineru-client.js index f197a44b..56ba0183 100644 --- a/Plugin/PaperReader/lib/mineru-client.js +++ b/Plugin/PaperReader/lib/mineru-client.js @@ -52,9 +52,38 @@ async function getUploadUrl(token, fileName, modelVersion) { async function uploadFile(uploadUrl, filePath) { const fileBuffer = await fs.readFile(filePath); // MinerU 文档明确说明:上传文件时无须设置 Content-Type 请求头 - await axios.put(uploadUrl, fileBuffer, { - timeout: 120000, - maxContentLength: 200 * 1024 * 1024 + // axios 会自动添加 Content-Type/Accept 等头部,导致 OSS 预签名 URL 签名校验失败 + // 改用 Node 原生 https 模块,只发送 Content-Length,完全匹配 Python requests.put(url, data=f) 的行为 + const { URL } = require('url'); + const https = require('https'); + const parsedUrl = new URL(uploadUrl); + + await new Promise((resolve, reject) => { + const req = https.request({ + hostname: parsedUrl.hostname, + port: parsedUrl.port || 443, + path: parsedUrl.pathname + parsedUrl.search, + method: 'PUT', + headers: { + 'Content-Length': fileBuffer.length + }, + timeout: 120000 + }, (res) => { + let body = ''; + res.on('data', chunk => body += chunk); + res.on('end', () => { + if (res.statusCode >= 200 && res.statusCode < 300) { + resolve(); + } else { + reject(new MineruError('MINERU_UPLOAD_FAILED', + `Upload failed: HTTP ${res.statusCode} - ${body.slice(0, 200)}`)); + } + }); + }); + req.on('error', reject); + req.on('timeout', () => { req.destroy(); reject(new MineruError('MINERU_UPLOAD_FAILED', 'Upload timeout')); }); + req.write(fileBuffer); + req.end(); }); } diff --git a/Plugin/PaperReader/lib/pdf-parse-fallback.js b/Plugin/PaperReader/lib/pdf-parse-fallback.js index 3edb96ab..006f96b7 100644 --- a/Plugin/PaperReader/lib/pdf-parse-fallback.js +++ b/Plugin/PaperReader/lib/pdf-parse-fallback.js @@ -1,24 +1,40 @@ /** * pdf-parse 降级回退封装 (T2) - * + * * 当 MinerU API 不可用时,回退到本地 pdf-parse 纯文本抽取。 * 输出格式与 mineru-client.js 对齐,但 figures 为空,markdown 为纯文本。 + * + * pdf-parse v2 API: new PDFParse({ data: Uint8Array }) → getText() → destroy() */ const fs = require('fs').promises; -const 
pdfParse = require('pdf-parse'); +const { PDFParse } = require('pdf-parse'); /** * 使用 pdf-parse 做纯文本抽取(降级模式) - * + * * @param {string} pdfPath - PDF 绝对路径 * @returns {Promise<{ markdown: string, figures: [], pageCount: number, figureMap: [], degraded: true }>} */ async function parsePdf(pdfPath) { const buffer = await fs.readFile(pdfPath); - const parsed = await pdfParse(buffer); + // pdf-parse v2 要求 Uint8Array 而非 Buffer + const uint8 = new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength); + + const parser = new PDFParse({ data: uint8 }); + let pageCount = null; + let rawText = ''; + + try { + const info = await parser.getInfo(); + pageCount = info.total || null; + + const textResult = await parser.getText(); + rawText = textResult.text || ''; + } finally { + await parser.destroy(); + } - const rawText = parsed.text || ''; const markdown = rawText .replace(/\r\n/g, '\n') .replace(/\n{3,}/g, '\n\n'); @@ -26,7 +42,7 @@ async function parsePdf(pdfPath) { return { markdown, figures: [], - pageCount: parsed.numpages || null, + pageCount, figureMap: [], degraded: true }; From 3614ecdcc49341c96a8f90d3e437938efab1f2c3 Mon Sep 17 00:00:00 2001 From: rongfeng Date: Sun, 8 Feb 2026 04:28:24 +0800 Subject: [PATCH 4/6] fix: return content to model + 429 retry with exponential backoff - PaperReader.js: ReadSkeleton/ReadDeep now return actual text content - llm.js: add 429 exponential backoff retry (5 attempts, 3s/6s/12s/24s) - deep-reader.js: add 1.5s inter-chunk delay to prevent rate limiting --- Plugin/PaperReader/PaperReader.js | 9 +++++++-- Plugin/PaperReader/lib/deep-reader.js | 9 +++++++++ Plugin/PaperReader/lib/llm.js | 26 ++++++++++++++++++++------ 3 files changed, 36 insertions(+), 8 deletions(-) diff --git a/Plugin/PaperReader/PaperReader.js b/Plugin/PaperReader/PaperReader.js index 9b3b522d..6335fa21 100644 --- a/Plugin/PaperReader/PaperReader.js +++ b/Plugin/PaperReader/PaperReader.js @@ -122,12 +122,17 @@ async function handleIngestPDF({ filePath, paperId }) { async function handleReadSkeleton({ paperId, focus }) { if (!paperId) throw new Error('ReadSkeleton requires paperId'); const result = await generateSkeleton(paperId, { focus }); - return { paperId, globalMapPath: result.globalMapPath }; + return { paperId, globalMapPath: result.globalMapPath, content: result.globalMapContent }; } async function handleReadDeep({ paperId, goal }) { if (!paperId) throw new Error('ReadDeep requires paperId'); - return await readDeep(paperId, { goal }); + const result = await readDeep(paperId, { goal }); + // Read the Round_1_Summary.md to return its content + const summaryContent = fsSync.existsSync(result.roundPath) + ? 
(await fs.readFile(result.roundPath, 'utf-8')) + : ''; + return { ...result, content: summaryContent }; } async function handleQuery({ paperId, question }) { diff --git a/Plugin/PaperReader/lib/deep-reader.js b/Plugin/PaperReader/lib/deep-reader.js index 05ab5890..ebd3fbde 100644 --- a/Plugin/PaperReader/lib/deep-reader.js +++ b/Plugin/PaperReader/lib/deep-reader.js @@ -15,6 +15,7 @@ const WORKSPACE_ROOT = path.join(__dirname, '..', 'workspace'); const BATCH_SIZE = parseInt(process.env.PaperReaderBatchSize || '4', 10); const MAX_CHUNKS = parseInt(process.env.PaperReaderMaxChunks || '120', 10); const ROLLING_CONTEXT_MAX_TOKENS = 4000; +const CHUNK_DELAY_MS = parseInt(process.env.PaperReaderChunkDelay || '1500', 10); /** * 压缩 Rolling Context(当超过上限时) @@ -101,10 +102,13 @@ async function readDeep(paperId, options = {}) { let rollingContext = ''; // Sequential processing with Rolling Context + // Process in small batches but maintain rolling context between batches for (let i = 0; i < limited.length; i += batchSize) { const batch = limited.slice(i, i + batchSize); + // Within a batch, process sequentially to maintain rolling context for (const chunk of batch) { + // Read chunk content const chunkPath = path.join(chunksDir, `chunk_${chunk.index}.md`); let chunkText; if (fsSync.existsSync(chunkPath)) { @@ -113,6 +117,11 @@ async function readDeep(paperId, options = {}) { chunkText = chunk.text || ''; } + // Delay between LLM calls to avoid 429 rate limiting (skip first chunk) + if (summaries.length > 0) { + await new Promise(r => setTimeout(r, CHUNK_DELAY_MS)); + } + const summary = await summarizeChunk(chunkText, { goal, globalMap, diff --git a/Plugin/PaperReader/lib/llm.js b/Plugin/PaperReader/lib/llm.js index abe09376..e253ed5b 100644 --- a/Plugin/PaperReader/lib/llm.js +++ b/Plugin/PaperReader/lib/llm.js @@ -42,12 +42,26 @@ async function callLLM(messages, { max_tokens = MAX_OUTPUT_TOKENS, temperature = temperature }; - const resp = await axios.post(API_URL, payload, { - headers: { Authorization: `Bearer ${API_KEY}`, 'Content-Type': 'application/json' }, - timeout: 180000 - }); - - return resp?.data?.choices?.[0]?.message?.content || ''; + const maxRetries = 5; + for (let attempt = 0; attempt < maxRetries; attempt++) { + try { + const resp = await axios.post(API_URL, payload, { + headers: { Authorization: `Bearer ${API_KEY}`, 'Content-Type': 'application/json' }, + timeout: 180000 + }); + return resp?.data?.choices?.[0]?.message?.content || ''; + } catch (err) { + const status = err?.response?.status; + if (status === 429 && attempt < maxRetries - 1) { + // Exponential backoff: 3s, 6s, 12s, 24s + const delay = 3000 * Math.pow(2, attempt); + process.stderr.write(`[PaperReader] 429 rate limit, retrying in ${delay / 1000}s (attempt ${attempt + 1}/${maxRetries})\n`); + await new Promise(r => setTimeout(r, delay)); + continue; + } + throw err; + } + } } /** From dce4176a850f648f6981fc4e62f775db96bb747b Mon Sep 17 00:00:00 2001 From: rongfeng Date: Sun, 8 Feb 2026 06:11:11 +0800 Subject: [PATCH 5/6] fix(PaperReader): fix LLM URL resolution, add concurrency & caching - lib/llm.js: auto-resolve localhost URL to correct VCP port + path; add error classifier (rate_limit/auth/timeout/network); add traceTag logging - lib/deep-reader.js: batch-internal concurrency via Promise.all; full cache (skip if Round_1_Summary.md exists); incremental cache (skip already-summarized chunks) - PaperReader.js: IngestPDF cache (skip if manifest+meta exist); route logging; passthrough forceReparse/forceReread params --- 
Plugin/PaperReader/PaperReader.js | 48 ++++++++++-- Plugin/PaperReader/lib/deep-reader.js | 102 ++++++++++++++++++++------ Plugin/PaperReader/lib/llm.js | 99 ++++++++++++++++++++++--- 3 files changed, 206 insertions(+), 43 deletions(-) diff --git a/Plugin/PaperReader/PaperReader.js b/Plugin/PaperReader/PaperReader.js index 6335fa21..e6c9bb51 100644 --- a/Plugin/PaperReader/PaperReader.js +++ b/Plugin/PaperReader/PaperReader.js @@ -39,7 +39,7 @@ async function writeJson(filePath, obj) { // ─── Command Handlers ─── -async function handleIngestPDF({ filePath, paperId }) { +async function handleIngestPDF({ filePath, paperId, forceReparse }) { if (!filePath || typeof filePath !== 'string') { throw new Error('IngestPDF requires filePath'); } @@ -54,6 +54,26 @@ async function handleIngestPDF({ filePath, paperId }) { : `paper-${sha1(abs).slice(0, 10)}`; const wsDir = getPaperWorkspace(resolvedPaperId); + const manifestPath = path.join(wsDir, 'chunks', 'manifest.json'); + const metaPath = path.join(wsDir, 'meta.json'); + + // ── Cache check: if manifest + meta already exist, skip re-parsing ── + if (!forceReparse && fsSync.existsSync(manifestPath) && fsSync.existsSync(metaPath)) { + const existingMeta = JSON.parse(await fs.readFile(metaPath, 'utf-8')); + const existingManifest = JSON.parse(await fs.readFile(manifestPath, 'utf-8')); + process.stderr.write(`[PaperReader][Ingest] cache hit: paperId=${resolvedPaperId}, chunkCount=${existingManifest.chunkCount}, engine=${existingMeta.engine}\n`); + return { + paperId: resolvedPaperId, + workspace: wsDir, + pageCount: existingMeta.pageCount, + chunkCount: existingManifest.chunkCount, + engine: existingMeta.engine, + cached: true + }; + } + + process.stderr.write(`[PaperReader][Ingest] no cache, starting full parse: paperId=${resolvedPaperId}\n`); + await fs.mkdir(wsDir, { recursive: true }); // L0: 解析 PDF → Markdown + Figures @@ -68,7 +88,7 @@ async function handleIngestPDF({ filePath, paperId }) { textLength: (parsed.markdown || '').length, engine: parsed.engine }; - await writeJson(path.join(wsDir, 'meta.json'), meta); + await writeJson(metaPath, meta); // Save full markdown await fs.writeFile(path.join(wsDir, 'full_text.md'), parsed.markdown || '', 'utf-8'); @@ -105,7 +125,7 @@ async function handleIngestPDF({ filePath, paperId }) { tokenCount: c.tokenCount })) }; - await writeJson(path.join(chunksDir, 'manifest.json'), manifest); + await writeJson(manifestPath, manifest); // Create reading_notes dir await fs.mkdir(path.join(wsDir, 'reading_notes'), { recursive: true }); @@ -115,7 +135,8 @@ async function handleIngestPDF({ filePath, paperId }) { workspace: wsDir, pageCount: meta.pageCount, chunkCount: chunks.length, - engine: parsed.engine + engine: parsed.engine, + cached: false }; } @@ -125,9 +146,13 @@ async function handleReadSkeleton({ paperId, focus }) { return { paperId, globalMapPath: result.globalMapPath, content: result.globalMapContent }; } -async function handleReadDeep({ paperId, goal }) { +async function handleReadDeep({ paperId, goal, maxChunks, batchSize, forceReread }) { if (!paperId) throw new Error('ReadDeep requires paperId'); - const result = await readDeep(paperId, { goal }); + const opts = { goal }; + if (maxChunks) opts.maxChunks = maxChunks; + if (batchSize) opts.batchSize = batchSize; + if (forceReread) opts.forceReread = true; + const result = await readDeep(paperId, opts); // Read the Round_1_Summary.md to return its content const summaryContent = fsSync.existsSync(result.roundPath) ? 
(await fs.readFile(result.roundPath, 'utf-8')) @@ -149,21 +174,27 @@ async function main() { const request = JSON.parse(inputData || '{}'); const command = request.command; + process.stderr.write(`[PaperReader][Main] request received: command=${command || 'undefined'}, paperId=${request.paperId || 'n/a'}\n`); + try { if (!command) throw new Error('Missing command'); let result; switch (command) { case 'IngestPDF': - result = await handleIngestPDF({ filePath: request.filePath, paperId: request.paperId }); + process.stderr.write('[PaperReader][Main] route hit: IngestPDF\n'); + result = await handleIngestPDF({ filePath: request.filePath, paperId: request.paperId, forceReparse: request.forceReparse }); break; case 'ReadSkeleton': + process.stderr.write('[PaperReader][Main] route hit: ReadSkeleton\n'); result = await handleReadSkeleton({ paperId: request.paperId, focus: request.focus }); break; case 'ReadDeep': - result = await handleReadDeep({ paperId: request.paperId, goal: request.goal }); + process.stderr.write('[PaperReader][Main] route hit: ReadDeep\n'); + result = await handleReadDeep({ paperId: request.paperId, goal: request.goal, maxChunks: request.maxChunks, batchSize: request.batchSize, forceReread: request.forceReread }); break; case 'Query': + process.stderr.write('[PaperReader][Main] route hit: Query\n'); result = await handleQuery({ paperId: request.paperId, question: request.question }); break; default: @@ -172,6 +203,7 @@ async function main() { sendResponse({ status: 'success', result }); } catch (err) { + process.stderr.write(`[PaperReader][Main] request failed: command=${command || 'undefined'}, error=${err?.message || String(err)}\n`); sendResponse({ status: 'error', error: err?.message || String(err) }); } } diff --git a/Plugin/PaperReader/lib/deep-reader.js b/Plugin/PaperReader/lib/deep-reader.js index ebd3fbde..7b0c2bb5 100644 --- a/Plugin/PaperReader/lib/deep-reader.js +++ b/Plugin/PaperReader/lib/deep-reader.js @@ -56,7 +56,7 @@ async function summarizeChunk(chunkText, { goal, globalMap, rollingContext, chun const result = await callLLMJson([ { role: 'system', content: system }, { role: 'user', content: userParts.join('\n\n') } - ], { temperature: 0.1 }); + ], { temperature: 0.1, traceTag: `DeepReader:chunk_${chunkIndex}` }); // Normalize result return { @@ -79,6 +79,18 @@ async function readDeep(paperId, options = {}) { const wsDir = path.join(WORKSPACE_ROOT, paperId); const chunksDir = path.join(wsDir, 'chunks'); const manifestPath = path.join(chunksDir, 'manifest.json'); + const notesDir = path.join(wsDir, 'reading_notes'); + const summariesPath = path.join(notesDir, 'Chunk_Summaries.json'); + const roundPath = path.join(notesDir, 'Round_1_Summary.md'); + + process.stderr.write(`[PaperReader][DeepReader] start: paperId=${paperId}, goal=${options.goal || '(default)'}\n`); + + // ── Cache check: if Round_1_Summary.md already exists, return directly ── + if (!options.forceReread && fsSync.existsSync(roundPath) && fsSync.existsSync(summariesPath)) { + const existingSummaries = JSON.parse(await fs.readFile(summariesPath, 'utf-8')); + process.stderr.write(`[PaperReader][DeepReader] cache hit: Round_1_Summary.md exists (${existingSummaries.count} chunk summaries). 
Returning cached result.\n`); + return { paperId, summariesPath, roundPath, cached: true }; + } if (!fsSync.existsSync(manifestPath)) { throw new Error(`chunks/manifest.json not found: ${manifestPath}`); @@ -98,16 +110,50 @@ async function readDeep(paperId, options = {}) { const goal = options.goal || ''; const limited = chunks.slice(0, maxChunks); - const summaries = []; + let summaries = []; let rollingContext = ''; - // Sequential processing with Rolling Context - // Process in small batches but maintain rolling context between batches + // ── Incremental resume: load existing chunk summaries if available ── + const existingSummariesMap = new Map(); + if (!options.forceReread && fsSync.existsSync(summariesPath)) { + try { + const existing = JSON.parse(await fs.readFile(summariesPath, 'utf-8')); + if (existing.summaries && Array.isArray(existing.summaries)) { + for (const s of existing.summaries) { + existingSummariesMap.set(s.chunkIndex, s); + } + process.stderr.write(`[PaperReader][DeepReader] found ${existingSummariesMap.size} cached chunk summaries, will skip those\n`); + } + } catch { /* ignore corrupt file */ } + } + + process.stderr.write(`[PaperReader][DeepReader] config: totalChunks=${chunks.length}, processing=${limited.length}, batchSize=${batchSize}, chunkDelay=${CHUNK_DELAY_MS}ms\n`); + + // Concurrent batch processing with Rolling Context + // Each batch shares the same rolling context snapshot, chunks within a batch run in parallel. + // After a batch completes, results are merged in order to update rolling context before next batch. for (let i = 0; i < limited.length; i += batchSize) { const batch = limited.slice(i, i + batchSize); + const batchNum = Math.floor(i / batchSize) + 1; + const totalBatches = Math.ceil(limited.length / batchSize); + process.stderr.write(`[PaperReader][DeepReader] batch ${batchNum}/${totalBatches} start (chunks ${i}-${Math.min(i + batchSize, limited.length) - 1}, concurrency=${batch.length})\n`); + + // Delay between batches to avoid rate limiting (skip first batch) + if (i > 0) { + await new Promise(r => setTimeout(r, CHUNK_DELAY_MS)); + } + + // Snapshot rolling context for this batch — all chunks in the batch see the same context + const batchRollingContext = rollingContext; + + // Launch all chunks in this batch concurrently (skip cached ones) + const batchPromises = batch.map(async (chunk) => { + // Check incremental cache + if (existingSummariesMap.has(chunk.index)) { + process.stderr.write(`[PaperReader][DeepReader] chunk ${chunk.index}/${limited.length - 1} (section: ${chunk.section || 'unknown'}) CACHED, skipping LLM\n`); + return existingSummariesMap.get(chunk.index); + } - // Within a batch, process sequentially to maintain rolling context - for (const chunk of batch) { // Read chunk content const chunkPath = path.join(chunksDir, `chunk_${chunk.index}.md`); let chunkText; @@ -117,44 +163,51 @@ async function readDeep(paperId, options = {}) { chunkText = chunk.text || ''; } - // Delay between LLM calls to avoid 429 rate limiting (skip first chunk) - if (summaries.length > 0) { - await new Promise(r => setTimeout(r, CHUNK_DELAY_MS)); - } + process.stderr.write(`[PaperReader][DeepReader] chunk ${chunk.index}/${limited.length - 1} (section: ${chunk.section || 'unknown'}) summarizing...\n`); const summary = await summarizeChunk(chunkText, { goal, globalMap, - rollingContext, + rollingContext: batchRollingContext, chunkIndex: chunk.index, section: chunk.section || 'unknown' }); - summaries.push({ + return { chunkIndex: chunk.index, section: 
chunk.section, ...summary - }); + }; + }); - // Update Rolling Context - const newFacts = summary.key_facts.join('; '); + // Wait for all chunks in this batch to complete + const batchResults = await Promise.all(batchPromises); + + // Merge results in order + for (const result of batchResults) { + summaries.push(result); + process.stderr.write(`[PaperReader][DeepReader] chunk ${result.chunkIndex} done (${summaries.length}/${limited.length} completed)\n`); + + // Update Rolling Context in order + const newFacts = result.key_facts.join('; '); if (newFacts) { - rollingContext += `\n[Chunk ${chunk.index} - ${chunk.section}]: ${newFacts}`; + rollingContext += `\n[Chunk ${result.chunkIndex} - ${result.section}]: ${newFacts}`; } + } - // Compress if exceeding limit - if (countTokens(rollingContext) > ROLLING_CONTEXT_MAX_TOKENS) { - rollingContext = await compressContext(rollingContext); - } + // Compress rolling context if exceeding limit (once per batch) + if (countTokens(rollingContext) > ROLLING_CONTEXT_MAX_TOKENS) { + process.stderr.write(`[PaperReader][DeepReader] rolling context exceeds ${ROLLING_CONTEXT_MAX_TOKENS} tokens, compressing...\n`); + rollingContext = await compressContext(rollingContext); } } // Save chunk summaries - const notesDir = path.join(wsDir, 'reading_notes'); await fs.mkdir(notesDir, { recursive: true }); - const summariesPath = path.join(notesDir, 'Chunk_Summaries.json'); await fs.writeFile(summariesPath, JSON.stringify({ count: summaries.length, summaries }, null, 2), 'utf-8'); + process.stderr.write(`[PaperReader][DeepReader] all ${summaries.length} chunks summarized, starting synthesis...\n`); + // Synthesis: merge all summaries into Round_1_Summary.md const system = [ '你是一个"长文档合并器",适用于各类文档。', @@ -172,11 +225,12 @@ async function readDeep(paperId, options = {}) { const merged = await callLLM([ { role: 'system', content: system }, { role: 'user', content: user } - ], { temperature: 0.2 }); + ], { temperature: 0.2, traceTag: 'DeepReader:synthesis' }); - const roundPath = path.join(notesDir, 'Round_1_Summary.md'); await fs.writeFile(roundPath, merged || '', 'utf-8'); + process.stderr.write(`[PaperReader][DeepReader] complete: summariesPath=${summariesPath}, roundPath=${roundPath}\n`); + return { paperId, summariesPath, roundPath }; } diff --git a/Plugin/PaperReader/lib/llm.js b/Plugin/PaperReader/lib/llm.js index e253ed5b..3bb0b745 100644 --- a/Plugin/PaperReader/lib/llm.js +++ b/Plugin/PaperReader/lib/llm.js @@ -1,6 +1,6 @@ /** * LLM 调用封装 (T4) - * + * * 从 PaperReader.js 抽出,统一管理模型调用。 */ @@ -10,28 +10,94 @@ const path = require('path'); require('dotenv').config({ path: path.join(__dirname, '..', 'config.env') }); require('dotenv').config({ path: path.join(__dirname, '..', '..', '..', 'config.env') }); -const API_KEY = process.env.API_Key; -const API_URL = process.env.API_URL; +const API_KEY = process.env.PaperReaderApiKey || process.env.Key || process.env.API_Key; +const RAW_API_URL = process.env.PaperReaderApiUrl || process.env.API_URL; +const VCP_PORT = process.env.PORT || '6005'; const MODEL = process.env.PaperReaderModel; const MAX_OUTPUT_TOKENS = parseInt(process.env.PaperReaderMaxOutputTokens || '12000', 10); +function resolveApiUrl() { + let url = RAW_API_URL; + if (!url) return null; + + // If API_URL is just a base like http://127.0.0.1:3000, auto-fix to VCP port + path + // VCP serves its chat completions API on PORT (default 6005), not the admin panel port + if (url.match(/^https?:\/\/(?:127\.0\.0\.1|localhost)(?::\d+)?$/)) { + const base = 
url.replace(/:\d+$/, ''); + url = `${base}:${VCP_PORT}/v1/chat/completions`; + } + + // Append /v1/chat/completions if URL doesn't already end with a path + if (!url.includes('/v1/') && !url.includes('/chat/')) { + url = url.replace(/\/$/, '') + '/v1/chat/completions'; + } + + return url; +} + +const API_URL = resolveApiUrl(); + function ensureConfig() { if (!API_KEY || !API_URL) { - throw new Error('Missing API config: API_Key/API_URL are required (from repo root config.env).'); + throw new Error( + `Missing API config: API_Key=${API_KEY ? 'set' : 'MISSING'}, API_URL=${API_URL || 'MISSING'} (raw=${RAW_API_URL || 'MISSING'}). ` + + 'Check repo root config.env and Plugin/PaperReader/config.env.' + ); } if (!MODEL) { throw new Error('Missing PaperReaderModel in config.env'); } } +function classifyLlmError(err) { + const status = err?.response?.status; + const code = err?.code; + + if (status === 429) { + return { + type: 'rate_limit', + message: 'LLM API 触发速率限制(429)。建议降低并发/增大 chunk 间隔后重试。' + }; + } + if (status === 401 || status === 403) { + return { + type: 'auth', + message: 'LLM API 鉴权失败(401/403)。请检查 API_Key 与权限。' + }; + } + if (code === 'ECONNABORTED') { + return { + type: 'timeout', + message: 'LLM API 请求超时(ECONNABORTED)。可提高超时或降低单次输入体积。' + }; + } + if (status >= 500 && status <= 599) { + return { + type: 'upstream_5xx', + message: `LLM API 上游服务错误(${status})。建议稍后重试。` + }; + } + if (code === 'ENOTFOUND' || code === 'ECONNREFUSED' || code === 'EAI_AGAIN') { + return { + type: 'network', + message: `LLM API 网络异常(${code})。请检查 API_URL 或网络连通性。` + }; + } + + return { + type: 'unknown', + message: `LLM API 未分类错误:${err?.message || 'unknown error'}` + }; +} + /** * 调用 LLM (OpenAI-compatible API) - * + * * @param {Array<{role: string, content: string}>} messages - * @param {object} options - { max_tokens, temperature } + * @param {object} options - { max_tokens, temperature, traceTag } * @returns {Promise} 模型输出文本 */ -async function callLLM(messages, { max_tokens = MAX_OUTPUT_TOKENS, temperature = 0.2 } = {}) { +async function callLLM(messages, { max_tokens = MAX_OUTPUT_TOKENS, temperature = 0.2, traceTag = 'callLLM' } = {}) { ensureConfig(); const payload = { @@ -45,34 +111,45 @@ async function callLLM(messages, { max_tokens = MAX_OUTPUT_TOKENS, temperature = const maxRetries = 5; for (let attempt = 0; attempt < maxRetries; attempt++) { try { + process.stderr.write(`[PaperReader][LLM][${traceTag}] request start: model=${MODEL}, attempt=${attempt + 1}/${maxRetries}, max_tokens=${max_tokens}\n`); const resp = await axios.post(API_URL, payload, { headers: { Authorization: `Bearer ${API_KEY}`, 'Content-Type': 'application/json' }, timeout: 180000 }); + process.stderr.write(`[PaperReader][LLM][${traceTag}] request success: attempt=${attempt + 1}/${maxRetries}\n`); return resp?.data?.choices?.[0]?.message?.content || ''; } catch (err) { const status = err?.response?.status; if (status === 429 && attempt < maxRetries - 1) { // Exponential backoff: 3s, 6s, 12s, 24s const delay = 3000 * Math.pow(2, attempt); - process.stderr.write(`[PaperReader] 429 rate limit, retrying in ${delay / 1000}s (attempt ${attempt + 1}/${maxRetries})\n`); + process.stderr.write(`[PaperReader][LLM][${traceTag}] 429 rate limit, retrying in ${delay / 1000}s (attempt ${attempt + 1}/${maxRetries})\n`); await new Promise(r => setTimeout(r, delay)); continue; } - throw err; + + const classified = classifyLlmError(err); + process.stderr.write( + `[PaperReader][LLM][${traceTag}] request failed: type=${classified.type}, status=${status || 
'n/a'}, code=${err?.code || 'n/a'}, message=${err?.message || 'n/a'}\n` + ); + throw new Error(`${classified.message} [status=${status || 'n/a'} code=${err?.code || 'n/a'}]`); } } } /** * 调用 LLM 并解析 JSON 响应 - * + * * @param {Array} messages * @param {object} options * @returns {Promise} 解析后的 JSON 对象 */ async function callLLMJson(messages, options = {}) { - const raw = await callLLM(messages, { ...options, temperature: options.temperature ?? 0.1 }); + const raw = await callLLM(messages, { + ...options, + temperature: options.temperature ?? 0.1, + traceTag: options.traceTag || 'callLLMJson' + }); try { // 尝试从 markdown 代码块中提取 JSON const jsonMatch = raw.match(/```(?:json)?\s*([\s\S]*?)```/); From f8cda116bccd5abd3ea8d107bff036de82413c57 Mon Sep 17 00:00:00 2001 From: rongfeng Date: Sun, 8 Feb 2026 20:56:04 +0800 Subject: [PATCH 6/6] =?UTF-8?q?feat(PaperReader):=20v0.4=20=E2=80=94=20?= =?UTF-8?q?=E7=BB=9F=E4=B8=80=E8=87=AA=E9=80=82=E5=BA=94=E9=98=85=E8=AF=BB?= =?UTF-8?q?=E5=BC=95=E6=93=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 Read 命令: Survey→Triage→DeepDive/Skim→Audit→Synthesize - 新增 Triage 分诊模块 (lib/triage.js): 注意力分配 + 拓扑排序 - 新增 Skim 模式 (lib/skim-reader.js): 轻量扫读 + upgrade 检测 - 新增 Auditor 审核员 (lib/auditor.js): 去污染独立审核 + PatchContext - 新增 ReadingState 持久化 (lib/reading-state.js) - Bug修复: Promise.allSettled 容错, 增量缓存 readMode 校验 - Bug修复: expandToChunkPlan skip 节点统计, O(n) 环检测 - 并发调优: BatchSize 2→5, MaxConcurrentLLM 3→5, ChunkDelay 1500→800ms - 更新 README, config.env.example, plugin-manifest.json --- Plugin/PaperReader/config.env.example | 12 ++- Plugin/PaperReader/lib/reading-state.js | 137 ++++++++++++++++++++++++ Plugin/PaperReader/lib/skim-reader.js | 52 +++++++++ Plugin/PaperReader/plugin-manifest.json | 28 +++-- 4 files changed, 219 insertions(+), 10 deletions(-) create mode 100644 Plugin/PaperReader/lib/reading-state.js create mode 100644 Plugin/PaperReader/lib/skim-reader.js diff --git a/Plugin/PaperReader/config.env.example b/Plugin/PaperReader/config.env.example index 92ffb42f..600481df 100644 --- a/Plugin/PaperReader/config.env.example +++ b/Plugin/PaperReader/config.env.example @@ -23,7 +23,15 @@ PaperReaderOverlap=0.15 PaperReaderModel=gemini-2.5-flash-search # 单次模型输出 token 上限 PaperReaderMaxOutputTokens=12000 -# 分批并发组大小(每组处理的 chunk 数) -PaperReaderBatchSize=4 +# 分批并发组大小(每组处理的 chunk 数,建议 ≤ MaxConcurrentLLM) +# ⚠️ 质量取舍:同批内的 deep chunk 共享同一份 Rolling Context 快照。 +# BatchSize=1(串行):上下文递进最强,chunk N 能看到 1..N-1 的所有发现 +# BatchSize=5(推荐):速度与质量的甜蜜点 +# BatchSize=10+:速度最快,但同批 chunk 无法互相感知(skim 不受影响) +# 极高精度需求(法律/财务逐条审计)建议 ≤3 +PaperReaderBatchSize=5 +# 进程级 LLM 最大并发请求数(防止 429 风暴,建议 3-8) +# 真正的并发控制由此 semaphore 管理,BatchSize 只控制批内共享上下文的范围 +PaperReaderMaxConcurrentLLM=5 # deep 阅读最多处理多少个 chunk(防止成本失控) PaperReaderMaxChunks=120 diff --git a/Plugin/PaperReader/lib/reading-state.js b/Plugin/PaperReader/lib/reading-state.js new file mode 100644 index 00000000..1d170cc6 --- /dev/null +++ b/Plugin/PaperReader/lib/reading-state.js @@ -0,0 +1,137 @@ +/** + * ReadingState 持久化管理 (v0.4) + * + * 管理 reading_state.json 的读写,支持: + * - 中断恢复 + * - 多轮阅读 + * - 跨会话接力 + */ + +const fs = require('fs').promises; +const fsSync = require('fs'); +const path = require('path'); + +const WORKSPACE_ROOT = path.join(__dirname, '..', 'workspace'); + +/** + * 创建空的 ReadingState + */ +function createEmptyState(docId, goal, mode) { + return { + docId, + goal: goal || '', + mode: mode || 'auto', + currentPhase: 'survey', + round: 1, + rollingContext: '', + readLog: [], + chunkSummaries: 
[], + auditReport: null, + createdAt: new Date().toISOString(), + updatedAt: new Date().toISOString() + }; +} + +/** + * 获取 reading_state.json 路径 + */ +function getStatePath(docId) { + return path.join(WORKSPACE_ROOT, docId, 'reading_notes', 'reading_state.json'); +} + +/** + * 加载 ReadingState(不存在则返回 null) + */ +async function loadState(docId) { + const statePath = getStatePath(docId); + if (!fsSync.existsSync(statePath)) return null; + try { + const raw = await fs.readFile(statePath, 'utf-8'); + return JSON.parse(raw); + } catch { + return null; + } +} + +/** + * 保存 ReadingState + */ +async function saveState(docId, state) { + const statePath = getStatePath(docId); + const dir = path.dirname(statePath); + await fs.mkdir(dir, { recursive: true }); + state.updatedAt = new Date().toISOString(); + await fs.writeFile(statePath, JSON.stringify(state, null, 2), 'utf-8'); +} + +/** + * 加载或创建 ReadingState + */ +async function loadOrCreateState(docId, goal, mode) { + const existing = await loadState(docId); + if (existing) { + // 如果 goal 不同,创建新的 round + if (goal && existing.goal !== goal) { + existing.round = (existing.round || 1) + 1; + existing.goal = goal; + existing.currentPhase = 'survey'; + existing.auditReport = null; + process.stderr.write(`[PaperReader][State] new round ${existing.round} with different goal\n`); + } + return existing; + } + return createEmptyState(docId, goal, mode); +} + +/** + * 记录一个 chunk 的阅读结果 + */ +function addChunkRead(state, { chunkIndex, section, readMode, nodeId }) { + state.readLog.push({ + chunkIndex, + section: section || 'unknown', + readMode, + nodeId: nodeId || null, + readAt: new Date().toISOString(), + round: state.round + }); +} + +/** + * 添加 chunk 摘要 + */ +function addChunkSummary(state, summary) { + // 去重:同 chunkIndex 只保留最新 + state.chunkSummaries = state.chunkSummaries.filter( + s => s.chunkIndex !== summary.chunkIndex + ); + state.chunkSummaries.push(summary); +} + +/** + * 更新阶段 + */ +function setPhase(state, phase) { + state.currentPhase = phase; +} + +/** + * 获取已读 chunk 索引集合(指定 round 或全部) + */ +function getReadChunkIndices(state, round) { + const log = round + ? 
state.readLog.filter(r => r.round === round) + : state.readLog; + return new Set(log.map(r => r.chunkIndex)); +} + +module.exports = { + createEmptyState, + loadState, + saveState, + loadOrCreateState, + addChunkRead, + addChunkSummary, + setPhase, + getReadChunkIndices +}; diff --git a/Plugin/PaperReader/lib/skim-reader.js b/Plugin/PaperReader/lib/skim-reader.js new file mode 100644 index 00000000..1f036910 --- /dev/null +++ b/Plugin/PaperReader/lib/skim-reader.js @@ -0,0 +1,52 @@ +/** + * Skim Reader 模块 (v0.4) + * + * 轻量摘要:用简化 prompt 处理 skim 标记的 chunk。 + * 核心约束:Skim 结果不写入 Rolling Context(不污染精读上下文)。 + * 支持 upgrade 检测:如果发现高密度信息,自动提升为 deep。 + */ + +const { callLLMJson } = require('./llm'); + +/** + * 对单个 chunk 执行 Skim 摘要 + * + * @param {string} chunkText - chunk 原文 + * @param {object} options - { goal, chunkIndex, section } + * @returns {Promise<{summary: string, upgrade: boolean, reason: string}>} + */ +async function skimChunk(chunkText, { goal, chunkIndex, section }) { + const system = [ + '你是一个快速扫读器。用一句话概括这个章节的核心内容。', + '如果发现与阅读目标高度相关的意外重要内容,标记 upgrade: true。', + '', + '输出 JSON(纯 JSON,不要代码块):', + '{"summary": string, "upgrade": boolean, "reason": string}', + '', + 'upgrade 规则:', + '- true:该 chunk 包含与阅读目标直接相关的关键数据/方法/结论,值得精读', + '- false:该 chunk 是背景/综述/已知信息,扫读即可', + 'reason:解释为什么 upgrade 或不 upgrade(一句话)' + ].join('\n'); + + const user = [ + `阅读目标:${goal || '全面理解文档核心内容'}`, + `当前位置:第 ${chunkIndex} 块,章节「${section}」`, + '', + `【chunk 内容】`, + chunkText + ].join('\n'); + + const result = await callLLMJson([ + { role: 'system', content: system }, + { role: 'user', content: user } + ], { temperature: 0.1, max_tokens: 500, traceTag: `Skim:chunk_${chunkIndex}` }); + + return { + summary: result.summary || result.raw_response || '', + upgrade: result.upgrade === true, + reason: result.reason || '' + }; +} + +module.exports = { skimChunk }; diff --git a/Plugin/PaperReader/plugin-manifest.json b/Plugin/PaperReader/plugin-manifest.json index 580fa8e6..25fccc2d 100644 --- a/Plugin/PaperReader/plugin-manifest.json +++ b/Plugin/PaperReader/plugin-manifest.json @@ -1,9 +1,9 @@ { "manifestVersion": "1.0.0", "name": "PaperReader", - "version": "0.2.0", + "version": "0.4.0", "displayName": "超文本递归阅读器", - "description": "将超长 PDF/文档转为可递归阅读的分块工件(Skeleton/Deep/Query)。适用于学术论文、技术报告、书籍章节、法律文书等各类长文档。v0.2: MinerU 云端高保真解析(公式/表格/图片/多栏/扫描OCR)+ 章节感知切分 + Rolling Context 深度阅读。MinerU 不可用时自动降级到 pdf-parse。", + "description": "统一自适应阅读引擎:将超长 PDF/文档转为目标驱动的多分辨率阅读流程。v0.4: 统一 Read 命令(Survey→Triage→DeepDive/Skim→Audit→Synthesize)、Triage 分诊注意力分配、Skim 轻量扫读、Auditor 去偏见审核、ReadingState 持久化。MinerU 云端高保真解析,不可用时自动降级到 pdf-parse。", "author": "VCP", "pluginType": "synchronous", "entryPoint": { @@ -12,7 +12,7 @@ }, "communication": { "protocol": "stdio", - "timeout": 600000 + "timeout": 1800000 }, "configSchema": { "MINERU_API_TOKEN": { @@ -45,11 +45,19 @@ }, "PaperReaderBatchSize": { "type": "integer", - "description": "Deep 阅读分批处理的 chunk 数,默认 4。" + "description": "Deep 阅读分批处理的 chunk 数,默认 5。同批内共享 Rolling Context 快照,越大速度越快但上下文递进越弱。" + }, + "PaperReaderMaxConcurrentLLM": { + "type": "integer", + "description": "进程级 LLM 最大并发请求数(semaphore),默认 5。真正的并发控制。" }, "PaperReaderMaxChunks": { "type": "integer", - "description": "单次 deep 阅读最多处理的 chunk 数(防止成本失控),默认 120。" + "description": "单次阅读最多处理的 chunk 数(防止成本失控),默认 120。" + }, + "PaperReaderMaxAuditChunks": { + "type": "integer", + "description": "Auditor 审核抽样最大 chunk 数,默认 8。" } }, "capabilities": { @@ -58,17 +66,21 @@ "commandIdentifier": "IngestPDF", "description": "解析 PDF 
并生成可递归阅读的分块工件。支持学术论文、技术报告、书籍、法律文书等各类 PDF。使用 MinerU 云端 API 高保真解析(保留公式/表格/图片/多栏排版),失败自动降级到 pdf-parse。输出章节感知的 chunk 文件。\n参数:\n- command: 固定为 IngestPDF\n- filePath (字符串, 必需): PDF 绝对路径\n- paperId (字符串, 可选): 自定义 ID(不传则自动生成)\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」IngestPDF「末」,\nfilePath:「始」D:\\\\books\\\\paper.pdf「末」\n<<<[END_TOOL_REQUEST]>>>" }, + { + "commandIdentifier": "Read", + "description": "v0.4 统一自适应阅读命令。自动执行完整流程:Survey(骨架提取)→ Triage(分诊注意力分配)→ DeepDive/Skim(精读/扫读)→ Audit(去偏见审核)→ Synthesize(合成报告)。这是推荐的阅读方式,自动决定哪些章节精读、哪些扫读、哪些跳过。注意:处理大文档(100+ chunks)可能需要数分钟。\n参数:\n- command: 固定为 Read\n- paperId (字符串, 必需): 文档 ID\n- goal (字符串, 可选): 阅读目标(影响 Triage 分诊策略)\n- forceReread (布尔值, 可选): 强制重新阅读(忽略缓存)\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」Read「末」,\npaperId:「始」xray-20260208-xxx「末」,\ngoal:「始」理解核心方法和实验设计「末」\n<<<[END_TOOL_REQUEST]>>>" + }, { "commandIdentifier": "ReadSkeleton", - "description": "基于已导入的文档工件生成骨架地图(Global Map)。从 Markdown 结构提取目录树、关键章节和图注,生成带阅读优先级标签的全局地图。适用于任何已 Ingest 的文档。\n参数:\n- command: 固定为 ReadSkeleton\n- paperId (字符串, 必需)\n- focus (字符串, 可选): 本次阅读关注点\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」ReadSkeleton「末」,\npaperId:「始」paper-xxx「末」,\nfocus:「始」方法学与实验设计「末」\n<<<[END_TOOL_REQUEST]>>>" + "description": "基于已导入的文档工件生成骨架地图(Global Map)+ 层级树索引。Read 命令会自动调用此步骤,通常不需要单独使用。\n参数:\n- command: 固定为 ReadSkeleton\n- paperId (字符串, 必需)\n- focus (字符串, 可选): 本次阅读关注点\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」ReadSkeleton「末」,\npaperId:「始」xray-xxx「末」,\nfocus:「始」方法学与实验设计「末」\n<<<[END_TOOL_REQUEST]>>>" }, { "commandIdentifier": "ReadDeep", - "description": "带 Rolling Context 的深度阅读:逐 chunk 摘要时携带前序累积的关键事实,保持 chunk 间连贯性。超出上限自动压缩。最终合并为 Round-1 深度笔记。\n参数:\n- command: 固定为 ReadDeep\n- paperId (字符串, 必需)\n- goal (字符串, 可选): 主任务目标(用于决定摘要粒度)\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」ReadDeep「末」,\npaperId:「始」paper-xxx「末」,\ngoal:「始」快速理解核心贡献与可复现实验步骤「末」\n<<<[END_TOOL_REQUEST]>>>" + "description": "全量 Rolling Context 深度阅读(无 Triage/Audit 的 v0.3 兼容模式)。对所有 chunk 无差别精读。推荐使用 Read 命令替代。\n参数:\n- command: 固定为 ReadDeep\n- paperId (字符串, 必需)\n- goal (字符串, 可选): 主任务目标\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」ReadDeep「末」,\npaperId:「始」xray-xxx「末」,\ngoal:「始」快速理解核心贡献「末」\n<<<[END_TOOL_REQUEST]>>>" }, { "commandIdentifier": "Query", - "description": "对已导入的文档做检索式问答(关键词匹配 + 章节权重挑选相关 chunk)。返回答案及引用来源。\n参数:\n- command: 固定为 Query\n- paperId (字符串, 必需)\n- question (字符串, 必需)\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」Query「末」,\npaperId:「始」paper-xxx「末」,\nquestion:「始」这份文档的核心结论是什么?「末」\n<<<[END_TOOL_REQUEST]>>>" + "description": "对已导入的文档做检索式问答(树索引推理式检索,降级到关键词匹配)。返回答案、推理过程及引用来源。\n参数:\n- command: 固定为 Query\n- paperId (字符串, 必需)\n- question (字符串, 必需)\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」Query「末」,\npaperId:「始」xray-xxx「末」,\nquestion:「始」这份文档的核心结论是什么?「末」\n<<<[END_TOOL_REQUEST]>>>" } ] }
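
A minimal local-invocation sketch (appended notes, not part of any patch above) for exercising the synchronous stdio protocol described in plugin-manifest.json: a JSON request is piped to PaperReader.js on stdin and the JSON response is read from stdout. The file path and the choice of execFileSync are illustrative assumptions, not part of the plugin:

    // Sketch: drive one IngestPDF request through the plugin's stdin/stdout protocol.
    const { execFileSync } = require('child_process');

    const request = JSON.stringify({
      command: 'IngestPDF',
      filePath: '/absolute/path/to/document.pdf',  // illustrative path
      forceReparse: false
    });

    const stdout = execFileSync('node', ['Plugin/PaperReader/PaperReader.js'], {
      input: request,
      encoding: 'utf-8'
    });

    const response = JSON.parse(stdout);
    // On success: { status: 'success', result: { paperId, workspace, pageCount, chunkCount, engine, cached } }
    // On failure: { status: 'error', error: '...' }
    console.log(response);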