diff --git a/Plugin/PaperReader/PaperReader.js b/Plugin/PaperReader/PaperReader.js new file mode 100644 index 00000000..e6c9bb51 --- /dev/null +++ b/Plugin/PaperReader/PaperReader.js @@ -0,0 +1,211 @@ +/** + * PaperReader v0.2 — 主入口 + * + * stdin 接收 JSON → 路由到各 command handler → stdout 输出 JSON + */ + +const fs = require('fs').promises; +const fsSync = require('fs'); +const path = require('path'); +const crypto = require('crypto'); + +require('dotenv').config({ path: path.join(__dirname, 'config.env') }); +require('dotenv').config({ path: path.join(__dirname, '..', '..', 'config.env') }); + +const { ingestPdf } = require('./lib/ingest'); +const { chunkMarkdown } = require('./lib/chunker'); +const { generateSkeleton } = require('./lib/skeleton'); +const { readDeep } = require('./lib/deep-reader'); +const { queryPaper } = require('./lib/query'); + +const WORKSPACE_ROOT = path.join(__dirname, 'workspace'); + +function sendResponse(data) { + process.stdout.write(JSON.stringify(data)); + process.exit(0); +} + +function sha1(input) { + return crypto.createHash('sha1').update(input).digest('hex'); +} + +function getPaperWorkspace(paperId) { + return path.join(WORKSPACE_ROOT, paperId); +} + +async function writeJson(filePath, obj) { + await fs.writeFile(filePath, JSON.stringify(obj, null, 2), 'utf-8'); +} + +// ─── Command Handlers ─── + +async function handleIngestPDF({ filePath, paperId, forceReparse }) { + if (!filePath || typeof filePath !== 'string') { + throw new Error('IngestPDF requires filePath'); + } + + const abs = path.isAbsolute(filePath) ? filePath : path.resolve(process.cwd(), filePath); + if (!fsSync.existsSync(abs)) { + throw new Error(`PDF not found: ${abs}`); + } + + const resolvedPaperId = paperId && String(paperId).trim() + ? String(paperId).trim() + : `paper-${sha1(abs).slice(0, 10)}`; + + const wsDir = getPaperWorkspace(resolvedPaperId); + const manifestPath = path.join(wsDir, 'chunks', 'manifest.json'); + const metaPath = path.join(wsDir, 'meta.json'); + + // ── Cache check: if manifest + meta already exist, skip re-parsing ── + if (!forceReparse && fsSync.existsSync(manifestPath) && fsSync.existsSync(metaPath)) { + const existingMeta = JSON.parse(await fs.readFile(metaPath, 'utf-8')); + const existingManifest = JSON.parse(await fs.readFile(manifestPath, 'utf-8')); + process.stderr.write(`[PaperReader][Ingest] cache hit: paperId=${resolvedPaperId}, chunkCount=${existingManifest.chunkCount}, engine=${existingMeta.engine}\n`); + return { + paperId: resolvedPaperId, + workspace: wsDir, + pageCount: existingMeta.pageCount, + chunkCount: existingManifest.chunkCount, + engine: existingMeta.engine, + cached: true + }; + } + + process.stderr.write(`[PaperReader][Ingest] no cache, starting full parse: paperId=${resolvedPaperId}\n`); + + await fs.mkdir(wsDir, { recursive: true }); + + // L0: 解析 PDF → Markdown + Figures + const parsed = await ingestPdf(abs, { outputDir: wsDir }); + + // Save meta + const meta = { + paperId: resolvedPaperId, + sourceFilePath: abs, + extractedAt: new Date().toISOString(), + pageCount: parsed.pageCount, + textLength: (parsed.markdown || '').length, + engine: parsed.engine + }; + await writeJson(metaPath, meta); + + // Save full markdown + await fs.writeFile(path.join(wsDir, 'full_text.md'), parsed.markdown || '', 'utf-8'); + + // Save figure map + if (parsed.figureMap && parsed.figureMap.length > 0) { + await writeJson(path.join(wsDir, 'figure_map.json'), parsed.figureMap); + } + + // L1: 章节感知切分 + const chunks = chunkMarkdown(parsed.markdown || ''); + + 
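  // Sketch (not wired in this version): chunkMarkdown() accepts { targetTokens, overlapRatio },
  // so the PaperReaderChunkSize / PaperReaderOverlap settings documented in config.env.example
  // could be forwarded here if desired, e.g.:
  //   chunkMarkdown(parsed.markdown || '', {
  //     targetTokens: parseInt(process.env.PaperReaderChunkSize || '2000', 10),
  //     overlapRatio: parseFloat(process.env.PaperReaderOverlap || '0.15')
  //   });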
// Save chunks + const chunksDir = path.join(wsDir, 'chunks'); + await fs.mkdir(chunksDir, { recursive: true }); + + for (const chunk of chunks) { + const chunkContent = chunk.metaHeader + ? `${chunk.metaHeader}\n\n---\n\n${chunk.text}` + : chunk.text; + await fs.writeFile( + path.join(chunksDir, `chunk_${chunk.index}.md`), + chunkContent, + 'utf-8' + ); + } + + // Save manifest + const manifest = { + chunkCount: chunks.length, + chunks: chunks.map(c => ({ + index: c.index, + section: c.section, + tokenCount: c.tokenCount + })) + }; + await writeJson(manifestPath, manifest); + + // Create reading_notes dir + await fs.mkdir(path.join(wsDir, 'reading_notes'), { recursive: true }); + + return { + paperId: resolvedPaperId, + workspace: wsDir, + pageCount: meta.pageCount, + chunkCount: chunks.length, + engine: parsed.engine, + cached: false + }; +} + +async function handleReadSkeleton({ paperId, focus }) { + if (!paperId) throw new Error('ReadSkeleton requires paperId'); + const result = await generateSkeleton(paperId, { focus }); + return { paperId, globalMapPath: result.globalMapPath, content: result.globalMapContent }; +} + +async function handleReadDeep({ paperId, goal, maxChunks, batchSize, forceReread }) { + if (!paperId) throw new Error('ReadDeep requires paperId'); + const opts = { goal }; + if (maxChunks) opts.maxChunks = maxChunks; + if (batchSize) opts.batchSize = batchSize; + if (forceReread) opts.forceReread = true; + const result = await readDeep(paperId, opts); + // Read the Round_1_Summary.md to return its content + const summaryContent = fsSync.existsSync(result.roundPath) + ? (await fs.readFile(result.roundPath, 'utf-8')) + : ''; + return { ...result, content: summaryContent }; +} + +async function handleQuery({ paperId, question }) { + return await queryPaper(paperId, question); +} + +// ─── Main ─── + +async function main() { + let inputData = ''; + process.stdin.setEncoding('utf8'); + for await (const chunk of process.stdin) inputData += chunk; + + const request = JSON.parse(inputData || '{}'); + const command = request.command; + + process.stderr.write(`[PaperReader][Main] request received: command=${command || 'undefined'}, paperId=${request.paperId || 'n/a'}\n`); + + try { + if (!command) throw new Error('Missing command'); + + let result; + switch (command) { + case 'IngestPDF': + process.stderr.write('[PaperReader][Main] route hit: IngestPDF\n'); + result = await handleIngestPDF({ filePath: request.filePath, paperId: request.paperId, forceReparse: request.forceReparse }); + break; + case 'ReadSkeleton': + process.stderr.write('[PaperReader][Main] route hit: ReadSkeleton\n'); + result = await handleReadSkeleton({ paperId: request.paperId, focus: request.focus }); + break; + case 'ReadDeep': + process.stderr.write('[PaperReader][Main] route hit: ReadDeep\n'); + result = await handleReadDeep({ paperId: request.paperId, goal: request.goal, maxChunks: request.maxChunks, batchSize: request.batchSize, forceReread: request.forceReread }); + break; + case 'Query': + process.stderr.write('[PaperReader][Main] route hit: Query\n'); + result = await handleQuery({ paperId: request.paperId, question: request.question }); + break; + default: + throw new Error(`Unknown command: ${command}`); + } + + sendResponse({ status: 'success', result }); + } catch (err) { + process.stderr.write(`[PaperReader][Main] request failed: command=${command || 'undefined'}, error=${err?.message || String(err)}\n`); + sendResponse({ status: 'error', error: err?.message || String(err) }); + } +} + +main(); 
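For local testing outside the VCP host, the stdin → stdout JSON protocol above can be driven with a small harness. The sketch below is illustrative only: `test-harness.js` is a hypothetical helper (not part of the plugin), and the sample PDF path and question are placeholders.

```js
// test-harness.js — hypothetical helper, not part of the plugin
const { spawn } = require('child_process');
const path = require('path');

function callPaperReader(request) {
  return new Promise((resolve, reject) => {
    const child = spawn('node', [path.join(__dirname, 'PaperReader.js')]);
    let stdout = '';
    child.stdout.on('data', d => { stdout += d; });
    child.stderr.on('data', d => process.stderr.write(d)); // progress logs go to stderr
    child.on('error', reject);
    child.on('close', () => {
      try { resolve(JSON.parse(stdout)); } catch (e) { reject(e); }
    });
    child.stdin.write(JSON.stringify(request));
    child.stdin.end();
  });
}

(async () => {
  // 1) Ingest a PDF (path is a placeholder)
  const ingest = await callPaperReader({ command: 'IngestPDF', filePath: '/tmp/sample.pdf' });
  console.log(ingest.status, ingest.result);
  if (ingest.status !== 'success') return;

  // 2) Ask a question against the ingested document
  const query = await callPaperReader({
    command: 'Query',
    paperId: ingest.result.paperId,
    question: '这份文档的核心结论是什么?'
  });
  console.log(query.status === 'success' ? query.result.answer : query.error);
})();
```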
diff --git a/Plugin/PaperReader/README.md b/Plugin/PaperReader/README.md new file mode 100644 index 00000000..327290bd --- /dev/null +++ b/Plugin/PaperReader/README.md @@ -0,0 +1,68 @@ +# PaperReader(v0.2) + +## 设计目标 + +将超长 PDF / 文档转为可控的递归阅读流程。适用于学术论文、技术报告、法律文书、书籍章节等各类长文档。 + +1. **L0 解析层**:MinerU 云端 API 高保真解析(保留公式/表格/图片/多栏排版),自动降级到 pdf-parse +2. **L1 切分层**:章节感知切分 + Meta-Header 注入 + 10-20% overlap +3. **L2 递归逻辑层**:Skeleton 骨架提取 / Rolling Context 深度阅读 / 合并综合 +4. **L3 存储交互层**:Obsidian 友好的 Markdown 目录结构 + +## 命令 + +| 命令 | 功能 | +|------|------| +| `IngestPDF` | PDF → Markdown → 章节感知 chunks | +| `ReadSkeleton` | 从目录/摘要/关键章节生成 Global Map | +| `ReadDeep` | 带 Rolling Context 的递归摘要 → Round-1 笔记 | +| `Query` | 检索式问答(关键词匹配 + 章节权重) | + +## 工件目录 + +``` +workspace/{paperId}/ +├── meta.json # 元数据(含解析引擎标识) +├── full_text.md # 完整 Markdown(L0 输出) +├── figure_map.json # Figure_ID ↔ Caption 映射 +├── assets/ +│ └── figures/ # 提取的图片 +├── chunks/ +│ ├── manifest.json # chunk 清单 + 章节映射 +│ └── chunk_{i}.md # 单个 chunk(含 Meta-Header) +└── reading_notes/ + ├── Global_Map.md # 骨架地图 + ├── Chunk_Summaries.json # 分块摘要 + └── Round_1_Summary.md # 深度笔记 +``` + +## 配置 + +复制 `config.env.example` 为 `config.env` 并填入: +- `MINERU_API_TOKEN`:MinerU 云端 API Token(不填则自动降级) +- `PaperReaderModel`:LLM 模型名称 +- 详见 `config.env.example` 中的完整配置项 + +## 依赖 + +- `axios`:HTTP 请求 +- `pdf-parse`:降级模式 PDF 解析 +- `adm-zip`:解压 MinerU 返回的 zip +- `@dqbd/tiktoken`:token 计数 +- `dotenv`:环境变量 + +## 支持的文档类型 + +MinerU 云端 API 支持解析: +- 学术论文(多栏、公式、引用) +- 技术报告 / 白皮书 +- 书籍章节 +- 法律文书 / 合同 +- 扫描版 PDF(内置 OCR) +- 含复杂表格的文档 + +## 常见限制 + +- MinerU 免费额度:每日 2000 页,单文件 200MB/600 页 +- Rolling Context 上限 4000 tokens,超出自动压缩 +- Query 目前为关键词匹配(向量检索计划在 Phase 2) diff --git a/Plugin/PaperReader/config.env.example b/Plugin/PaperReader/config.env.example new file mode 100644 index 00000000..600481df --- /dev/null +++ b/Plugin/PaperReader/config.env.example @@ -0,0 +1,37 @@ +# PaperReader 插件配置(示例) + +# === L0 解析层 === +# MinerU 云端 API Token(从 mineru.net 网站的「个人中心 → API密钥管理」获取) +# 注意:这里填的是 Bearer Token(一串长字符串),不是 Access Key / Secret Key +# 不填则自动降级到 pdf-parse 纯文本模式 +MINERU_API_TOKEN= +# MinerU 模型版本:pipeline(默认,速度快)或 vlm(效果更好,速度较慢) +MINERU_MODEL_VERSION=pipeline +# 轮询超时(ms),默认 5 分钟 +MINERU_API_TIMEOUT=300000 +# 轮询间隔(ms),默认 5 秒 +MINERU_POLL_INTERVAL=5000 + +# === L1 切分层 === +# 目标 chunk 大小(tokens) +PaperReaderChunkSize=2000 +# chunk 重叠比例 +PaperReaderOverlap=0.15 + +# === L2 递归逻辑层 === +# 读取/总结模型(使用 VCP 的 API_URL/API_Key 调用 /v1/chat/completions) +PaperReaderModel=gemini-2.5-flash-search +# 单次模型输出 token 上限 +PaperReaderMaxOutputTokens=12000 +# 分批并发组大小(每组处理的 chunk 数,建议 ≤ MaxConcurrentLLM) +# ⚠️ 质量取舍:同批内的 deep chunk 共享同一份 Rolling Context 快照。 +# BatchSize=1(串行):上下文递进最强,chunk N 能看到 1..N-1 的所有发现 +# BatchSize=5(推荐):速度与质量的甜蜜点 +# BatchSize=10+:速度最快,但同批 chunk 无法互相感知(skim 不受影响) +# 极高精度需求(法律/财务逐条审计)建议 ≤3 +PaperReaderBatchSize=5 +# 进程级 LLM 最大并发请求数(防止 429 风暴,建议 3-8) +# 真正的并发控制由此 semaphore 管理,BatchSize 只控制批内共享上下文的范围 +PaperReaderMaxConcurrentLLM=5 +# deep 阅读最多处理多少个 chunk(防止成本失控) +PaperReaderMaxChunks=120 diff --git a/Plugin/PaperReader/lib/chunker.js b/Plugin/PaperReader/lib/chunker.js new file mode 100644 index 00000000..2b2f61c8 --- /dev/null +++ b/Plugin/PaperReader/lib/chunker.js @@ -0,0 +1,183 @@ +/** + * 章节感知切分器 (T3) + * + * 按 Markdown 章节标题(##)切分,超长章节在段落边界二次切分。 + * 每个 chunk 注入 Meta-Header(章节名 + 全局摘要占位 + overlap)。 + * 使用 tiktoken cl100k_base 计算 token 数。 + */ + +const { get_encoding } = require('@dqbd/tiktoken'); + +const encoding = get_encoding('cl100k_base'); + +const DEFAULT_TARGET_TOKENS = 2000; +const 
DEFAULT_OVERLAP_RATIO = 0.15; +const DEFAULT_MAX_CHUNKS = 120; + +/** + * 计算文本的 token 数 + */ +function countTokens(text) { + if (!text) return 0; + return encoding.encode(text).length; +} + +/** + * 从 Markdown 中提取章节结构 + * @returns {Array<{ level: number, title: string, content: string }>} + */ +function extractSections(markdown) { + const lines = markdown.split('\n'); + const sections = []; + let currentSection = { level: 0, title: '(Preamble)', lines: [] }; + + for (const line of lines) { + const headerMatch = line.match(/^(#{1,4})\s+(.+)$/); + if (headerMatch) { + // Save previous section + if (currentSection.lines.length > 0 || currentSection.title !== '(Preamble)') { + sections.push({ + level: currentSection.level, + title: currentSection.title, + content: currentSection.lines.join('\n') + }); + } + currentSection = { + level: headerMatch[1].length, + title: headerMatch[2].trim(), + lines: [line] + }; + } else { + currentSection.lines.push(line); + } + } + + // Push last section + if (currentSection.lines.length > 0) { + sections.push({ + level: currentSection.level, + title: currentSection.title, + content: currentSection.lines.join('\n') + }); + } + + return sections; +} + +/** + * 在段落边界切分超长文本 + * @returns {string[]} + */ +function splitAtParagraphs(text, targetTokens) { + const paragraphs = text.split(/\n\n+/); + const pieces = []; + let current = ''; + let currentTokens = 0; + + for (const para of paragraphs) { + const paraTokens = countTokens(para); + + if (currentTokens + paraTokens > targetTokens && current.trim()) { + pieces.push(current.trim()); + current = ''; + currentTokens = 0; + } + + // Handle single paragraph exceeding limit + if (paraTokens > targetTokens && !current.trim()) { + const sentences = para.split(/(?<=[。?!.!?\n])/g); + for (const sent of sentences) { + const sentTokens = countTokens(sent); + if (currentTokens + sentTokens > targetTokens && current.trim()) { + pieces.push(current.trim()); + current = ''; + currentTokens = 0; + } + current += sent; + currentTokens += sentTokens; + } + continue; + } + + current += (current ? 
'\n\n' : '') + para; + currentTokens += paraTokens; + } + + if (current.trim()) { + pieces.push(current.trim()); + } + + return pieces; +} + +/** + * 生成 Meta-Header + */ +function makeMetaHeader(section, globalSummary, overlapText) { + const parts = [`[章节: ${section}]`]; + if (globalSummary) { + parts.push(`[全局摘要: ${globalSummary}]`); + } + if (overlapText) { + parts.push(`[上文衔接: ...${overlapText.slice(-200)}]`); + } + return parts.join('\n'); +} + +/** + * 章节感知切分 + * + * @param {string} markdown - L0 输出的 Markdown + * @param {object} options - { targetTokens, overlapRatio, maxChunks, globalSummary } + * @returns {Array<{ index, section, tokenCount, text, metaHeader }>} + */ +function chunkMarkdown(markdown, options = {}) { + const targetTokens = options.targetTokens || DEFAULT_TARGET_TOKENS; + const overlapRatio = options.overlapRatio || DEFAULT_OVERLAP_RATIO; + const maxChunks = options.maxChunks || DEFAULT_MAX_CHUNKS; + const globalSummary = options.globalSummary || ''; + + if (!markdown || !markdown.trim()) return []; + + const sections = extractSections(markdown); + const chunks = []; + let prevTail = ''; + + for (const section of sections) { + const sectionTokens = countTokens(section.content); + + if (sectionTokens <= targetTokens) { + const metaHeader = makeMetaHeader(section.title, globalSummary, prevTail); + const text = section.content; + chunks.push({ + index: chunks.length, + section: section.title, + tokenCount: countTokens(metaHeader + '\n\n' + text), + text, + metaHeader + }); + const tailLen = Math.floor(text.length * overlapRatio); + prevTail = text.slice(-tailLen); + } else { + const pieces = splitAtParagraphs(section.content, targetTokens); + for (const piece of pieces) { + const metaHeader = makeMetaHeader(section.title, globalSummary, prevTail); + chunks.push({ + index: chunks.length, + section: section.title, + tokenCount: countTokens(metaHeader + '\n\n' + piece), + text: piece, + metaHeader + }); + const tailLen = Math.floor(piece.length * overlapRatio); + prevTail = piece.slice(-tailLen); + } + } + + if (chunks.length >= maxChunks) break; + } + + return chunks.slice(0, maxChunks); +} + +module.exports = { chunkMarkdown, countTokens, extractSections }; diff --git a/Plugin/PaperReader/lib/deep-reader.js b/Plugin/PaperReader/lib/deep-reader.js new file mode 100644 index 00000000..7b0c2bb5 --- /dev/null +++ b/Plugin/PaperReader/lib/deep-reader.js @@ -0,0 +1,237 @@ +/** + * Rolling Context Deep Reader (T6) + * + * 带滚动上下文的深度阅读:每个 chunk 摘要时携带前序累积的关键事实, + * 保持 chunk 间的连贯性。超出上限时自动压缩。 + */ + +const fs = require('fs').promises; +const fsSync = require('fs'); +const path = require('path'); +const { callLLM, callLLMJson } = require('./llm'); +const { countTokens } = require('./chunker'); + +const WORKSPACE_ROOT = path.join(__dirname, '..', 'workspace'); +const BATCH_SIZE = parseInt(process.env.PaperReaderBatchSize || '4', 10); +const MAX_CHUNKS = parseInt(process.env.PaperReaderMaxChunks || '120', 10); +const ROLLING_CONTEXT_MAX_TOKENS = 4000; +const CHUNK_DELAY_MS = parseInt(process.env.PaperReaderChunkDelay || '1500', 10); + +/** + * 压缩 Rolling Context(当超过上限时) + */ +async function compressContext(rollingContext) { + const compressed = await callLLM([ + { role: 'system', content: '将以下累积的阅读笔记压缩为关键事实列表,保留最重要的信息、关键步骤和核心结论。删除冗余和过渡性描述。输出纯文本,不超过 2000 tokens。' }, + { role: 'user', content: rollingContext } + ], { max_tokens: 3000, temperature: 0.1 }); + return compressed; +} + +/** + * 对单个 chunk 做摘要(携带 Rolling Context) + */ +async function summarizeChunk(chunkText, { goal, globalMap, 
rollingContext, chunkIndex, section }) { + const system = [ + '你是一个"长文档分块摘要器",适用于各类文档(学术论文、技术报告、书籍、法律文书等)。', + '你会结合已有的阅读上下文,对当前 chunk 进行摘要。', + '输出 JSON(纯 JSON,不要代码块):', + '{"summary": string, "key_facts": string[], "methods": string[], "claims": string[], "open_questions": string[]}', + '其中 methods 字段可包含任何流程/步骤/操作方法(不限于科研实验),claims 包含文档中的核心论断/条款/规定。' + ].join('\n'); + + const userParts = [ + `主任务目标:${goal || '全面理解文档核心内容'}`, + `当前位置:第 ${chunkIndex} 块,章节「${section}」` + ]; + + if (rollingContext) { + userParts.push(`【已有阅读上下文】\n${rollingContext}`); + } + if (globalMap) { + userParts.push(`【全局地图摘要】\n${globalMap.slice(0, 2000)}`); + } + userParts.push(`【当前 chunk 内容】\n${chunkText}`); + + const result = await callLLMJson([ + { role: 'system', content: system }, + { role: 'user', content: userParts.join('\n\n') } + ], { temperature: 0.1, traceTag: `DeepReader:chunk_${chunkIndex}` }); + + // Normalize result + return { + summary: result.summary || result.raw_response || '', + key_facts: result.key_facts || [], + methods: result.methods || [], + claims: result.claims || [], + open_questions: result.open_questions || [] + }; +} + +/** + * 带滚动上下文的深度阅读 + * + * @param {string} paperId + * @param {object} options - { goal, batchSize, maxChunks } + * @returns {Promise<{ summariesPath, roundPath }>} + */ +async function readDeep(paperId, options = {}) { + const wsDir = path.join(WORKSPACE_ROOT, paperId); + const chunksDir = path.join(wsDir, 'chunks'); + const manifestPath = path.join(chunksDir, 'manifest.json'); + const notesDir = path.join(wsDir, 'reading_notes'); + const summariesPath = path.join(notesDir, 'Chunk_Summaries.json'); + const roundPath = path.join(notesDir, 'Round_1_Summary.md'); + + process.stderr.write(`[PaperReader][DeepReader] start: paperId=${paperId}, goal=${options.goal || '(default)'}\n`); + + // ── Cache check: if Round_1_Summary.md already exists, return directly ── + if (!options.forceReread && fsSync.existsSync(roundPath) && fsSync.existsSync(summariesPath)) { + const existingSummaries = JSON.parse(await fs.readFile(summariesPath, 'utf-8')); + process.stderr.write(`[PaperReader][DeepReader] cache hit: Round_1_Summary.md exists (${existingSummaries.count} chunk summaries). Returning cached result.\n`); + return { paperId, summariesPath, roundPath, cached: true }; + } + + if (!fsSync.existsSync(manifestPath)) { + throw new Error(`chunks/manifest.json not found: ${manifestPath}`); + } + + const manifest = JSON.parse(await fs.readFile(manifestPath, 'utf-8')); + const chunks = manifest.chunks || []; + + // Load Global Map if exists + const globalMapPath = path.join(wsDir, 'reading_notes', 'Global_Map.md'); + const globalMap = fsSync.existsSync(globalMapPath) + ? 
await fs.readFile(globalMapPath, 'utf-8') + : ''; + + const batchSize = options.batchSize || BATCH_SIZE; + const maxChunks = Math.min(options.maxChunks || MAX_CHUNKS, chunks.length); + const goal = options.goal || ''; + + const limited = chunks.slice(0, maxChunks); + let summaries = []; + let rollingContext = ''; + + // ── Incremental resume: load existing chunk summaries if available ── + const existingSummariesMap = new Map(); + if (!options.forceReread && fsSync.existsSync(summariesPath)) { + try { + const existing = JSON.parse(await fs.readFile(summariesPath, 'utf-8')); + if (existing.summaries && Array.isArray(existing.summaries)) { + for (const s of existing.summaries) { + existingSummariesMap.set(s.chunkIndex, s); + } + process.stderr.write(`[PaperReader][DeepReader] found ${existingSummariesMap.size} cached chunk summaries, will skip those\n`); + } + } catch { /* ignore corrupt file */ } + } + + process.stderr.write(`[PaperReader][DeepReader] config: totalChunks=${chunks.length}, processing=${limited.length}, batchSize=${batchSize}, chunkDelay=${CHUNK_DELAY_MS}ms\n`); + + // Concurrent batch processing with Rolling Context + // Each batch shares the same rolling context snapshot, chunks within a batch run in parallel. + // After a batch completes, results are merged in order to update rolling context before next batch. + for (let i = 0; i < limited.length; i += batchSize) { + const batch = limited.slice(i, i + batchSize); + const batchNum = Math.floor(i / batchSize) + 1; + const totalBatches = Math.ceil(limited.length / batchSize); + process.stderr.write(`[PaperReader][DeepReader] batch ${batchNum}/${totalBatches} start (chunks ${i}-${Math.min(i + batchSize, limited.length) - 1}, concurrency=${batch.length})\n`); + + // Delay between batches to avoid rate limiting (skip first batch) + if (i > 0) { + await new Promise(r => setTimeout(r, CHUNK_DELAY_MS)); + } + + // Snapshot rolling context for this batch — all chunks in the batch see the same context + const batchRollingContext = rollingContext; + + // Launch all chunks in this batch concurrently (skip cached ones) + const batchPromises = batch.map(async (chunk) => { + // Check incremental cache + if (existingSummariesMap.has(chunk.index)) { + process.stderr.write(`[PaperReader][DeepReader] chunk ${chunk.index}/${limited.length - 1} (section: ${chunk.section || 'unknown'}) CACHED, skipping LLM\n`); + return existingSummariesMap.get(chunk.index); + } + + // Read chunk content + const chunkPath = path.join(chunksDir, `chunk_${chunk.index}.md`); + let chunkText; + if (fsSync.existsSync(chunkPath)) { + chunkText = await fs.readFile(chunkPath, 'utf-8'); + } else { + chunkText = chunk.text || ''; + } + + process.stderr.write(`[PaperReader][DeepReader] chunk ${chunk.index}/${limited.length - 1} (section: ${chunk.section || 'unknown'}) summarizing...\n`); + + const summary = await summarizeChunk(chunkText, { + goal, + globalMap, + rollingContext: batchRollingContext, + chunkIndex: chunk.index, + section: chunk.section || 'unknown' + }); + + return { + chunkIndex: chunk.index, + section: chunk.section, + ...summary + }; + }); + + // Wait for all chunks in this batch to complete + const batchResults = await Promise.all(batchPromises); + + // Merge results in order + for (const result of batchResults) { + summaries.push(result); + process.stderr.write(`[PaperReader][DeepReader] chunk ${result.chunkIndex} done (${summaries.length}/${limited.length} completed)\n`); + + // Update Rolling Context in order + const newFacts = 
result.key_facts.join('; '); + if (newFacts) { + rollingContext += `\n[Chunk ${result.chunkIndex} - ${result.section}]: ${newFacts}`; + } + } + + // Compress rolling context if exceeding limit (once per batch) + if (countTokens(rollingContext) > ROLLING_CONTEXT_MAX_TOKENS) { + process.stderr.write(`[PaperReader][DeepReader] rolling context exceeds ${ROLLING_CONTEXT_MAX_TOKENS} tokens, compressing...\n`); + rollingContext = await compressContext(rollingContext); + } + } + + // Save chunk summaries + await fs.mkdir(notesDir, { recursive: true }); + await fs.writeFile(summariesPath, JSON.stringify({ count: summaries.length, summaries }, null, 2), 'utf-8'); + + process.stderr.write(`[PaperReader][DeepReader] all ${summaries.length} chunks summarized, starting synthesis...\n`); + + // Synthesis: merge all summaries into Round_1_Summary.md + const system = [ + '你是一个"长文档合并器",适用于各类文档。', + '输入是多段 chunk 的结构化摘要(含滚动上下文),请合并成一份结构化的深度笔记。', + '输出 Markdown,根据文档类型自适应包含:核心主题与结论、关键内容与论点、方法/流程/步骤(如有)、重要数据与证据、局限与风险、待解决问题清单。' + ].join('\n'); + + const user = [ + `主任务目标:${goal || '全面理解文档核心内容'}`, + globalMap ? `全局地图:\n${globalMap.slice(0, 3000)}` : '', + `最终累积上下文:\n${rollingContext}`, + `Chunk 摘要(${summaries.length} 个):\n${JSON.stringify(summaries).slice(0, 150000)}` + ].filter(Boolean).join('\n\n'); + + const merged = await callLLM([ + { role: 'system', content: system }, + { role: 'user', content: user } + ], { temperature: 0.2, traceTag: 'DeepReader:synthesis' }); + + await fs.writeFile(roundPath, merged || '', 'utf-8'); + + process.stderr.write(`[PaperReader][DeepReader] complete: summariesPath=${summariesPath}, roundPath=${roundPath}\n`); + + return { paperId, summariesPath, roundPath }; +} + +module.exports = { readDeep }; diff --git a/Plugin/PaperReader/lib/ingest.js b/Plugin/PaperReader/lib/ingest.js new file mode 100644 index 00000000..56374ad9 --- /dev/null +++ b/Plugin/PaperReader/lib/ingest.js @@ -0,0 +1,41 @@ +/** + * 统一解析入口 (T1+T2) + * + * 优先使用 MinerU 云端 API,失败则自动降级到 pdf-parse。 + */ + +const path = require('path'); +const fs = require('fs').promises; +const mineruClient = require('./mineru-client'); +const fallback = require('./pdf-parse-fallback'); + +/** + * 统一解析入口:优先 MinerU,失败则降级 + * + * @param {string} pdfPath - PDF 绝对路径 + * @param {object} options - { outputDir, token, timeout, pollInterval } + * @returns {Promise<{ markdown, figures, pageCount, figureMap, engine: 'mineru'|'pdf-parse' }>} + */ +async function ingestPdf(pdfPath, options = {}) { + const outputDir = options.outputDir || path.dirname(pdfPath); + const hasMineruToken = !!(options.token || process.env.MINERU_API_TOKEN); + + if (hasMineruToken) { + try { + const result = await mineruClient.parsePdf(pdfPath, { ...options, outputDir }); + return { ...result, engine: 'mineru' }; + } catch (err) { + // Log degradation warning, then fall through to pdf-parse + const errMsg = err instanceof mineruClient.MineruError + ? 
`[MinerU ${err.code}] ${err.message}` + : `[MinerU Error] ${err.message}`; + process.stderr.write(`[PaperReader] MinerU failed, degrading to pdf-parse: ${errMsg}\n`); + } + } + + // Fallback to pdf-parse + const result = await fallback.parsePdf(pdfPath); + return { ...result, engine: 'pdf-parse' }; +} + +module.exports = { ingestPdf }; diff --git a/Plugin/PaperReader/lib/llm.js b/Plugin/PaperReader/lib/llm.js new file mode 100644 index 00000000..3bb0b745 --- /dev/null +++ b/Plugin/PaperReader/lib/llm.js @@ -0,0 +1,163 @@ +/** + * LLM 调用封装 (T4) + * + * 从 PaperReader.js 抽出,统一管理模型调用。 + */ + +const axios = require('axios'); +const path = require('path'); + +require('dotenv').config({ path: path.join(__dirname, '..', 'config.env') }); +require('dotenv').config({ path: path.join(__dirname, '..', '..', '..', 'config.env') }); + +const API_KEY = process.env.PaperReaderApiKey || process.env.Key || process.env.API_Key; +const RAW_API_URL = process.env.PaperReaderApiUrl || process.env.API_URL; +const VCP_PORT = process.env.PORT || '6005'; +const MODEL = process.env.PaperReaderModel; +const MAX_OUTPUT_TOKENS = parseInt(process.env.PaperReaderMaxOutputTokens || '12000', 10); + +function resolveApiUrl() { + let url = RAW_API_URL; + if (!url) return null; + + // If API_URL is just a base like http://127.0.0.1:3000, auto-fix to VCP port + path + // VCP serves its chat completions API on PORT (default 6005), not the admin panel port + if (url.match(/^https?:\/\/(?:127\.0\.0\.1|localhost)(?::\d+)?$/)) { + const base = url.replace(/:\d+$/, ''); + url = `${base}:${VCP_PORT}/v1/chat/completions`; + } + + // Append /v1/chat/completions if URL doesn't already end with a path + if (!url.includes('/v1/') && !url.includes('/chat/')) { + url = url.replace(/\/$/, '') + '/v1/chat/completions'; + } + + return url; +} + +const API_URL = resolveApiUrl(); + +function ensureConfig() { + if (!API_KEY || !API_URL) { + throw new Error( + `Missing API config: API_Key=${API_KEY ? 'set' : 'MISSING'}, API_URL=${API_URL || 'MISSING'} (raw=${RAW_API_URL || 'MISSING'}). ` + + 'Check repo root config.env and Plugin/PaperReader/config.env.' 
+ ); + } + if (!MODEL) { + throw new Error('Missing PaperReaderModel in config.env'); + } +} + +function classifyLlmError(err) { + const status = err?.response?.status; + const code = err?.code; + + if (status === 429) { + return { + type: 'rate_limit', + message: 'LLM API 触发速率限制(429)。建议降低并发/增大 chunk 间隔后重试。' + }; + } + if (status === 401 || status === 403) { + return { + type: 'auth', + message: 'LLM API 鉴权失败(401/403)。请检查 API_Key 与权限。' + }; + } + if (code === 'ECONNABORTED') { + return { + type: 'timeout', + message: 'LLM API 请求超时(ECONNABORTED)。可提高超时或降低单次输入体积。' + }; + } + if (status >= 500 && status <= 599) { + return { + type: 'upstream_5xx', + message: `LLM API 上游服务错误(${status})。建议稍后重试。` + }; + } + if (code === 'ENOTFOUND' || code === 'ECONNREFUSED' || code === 'EAI_AGAIN') { + return { + type: 'network', + message: `LLM API 网络异常(${code})。请检查 API_URL 或网络连通性。` + }; + } + + return { + type: 'unknown', + message: `LLM API 未分类错误:${err?.message || 'unknown error'}` + }; +} + +/** + * 调用 LLM (OpenAI-compatible API) + * + * @param {Array<{role: string, content: string}>} messages + * @param {object} options - { max_tokens, temperature, traceTag } + * @returns {Promise} 模型输出文本 + */ +async function callLLM(messages, { max_tokens = MAX_OUTPUT_TOKENS, temperature = 0.2, traceTag = 'callLLM' } = {}) { + ensureConfig(); + + const payload = { + model: MODEL, + messages, + stream: false, + max_tokens, + temperature + }; + + const maxRetries = 5; + for (let attempt = 0; attempt < maxRetries; attempt++) { + try { + process.stderr.write(`[PaperReader][LLM][${traceTag}] request start: model=${MODEL}, attempt=${attempt + 1}/${maxRetries}, max_tokens=${max_tokens}\n`); + const resp = await axios.post(API_URL, payload, { + headers: { Authorization: `Bearer ${API_KEY}`, 'Content-Type': 'application/json' }, + timeout: 180000 + }); + process.stderr.write(`[PaperReader][LLM][${traceTag}] request success: attempt=${attempt + 1}/${maxRetries}\n`); + return resp?.data?.choices?.[0]?.message?.content || ''; + } catch (err) { + const status = err?.response?.status; + if (status === 429 && attempt < maxRetries - 1) { + // Exponential backoff: 3s, 6s, 12s, 24s + const delay = 3000 * Math.pow(2, attempt); + process.stderr.write(`[PaperReader][LLM][${traceTag}] 429 rate limit, retrying in ${delay / 1000}s (attempt ${attempt + 1}/${maxRetries})\n`); + await new Promise(r => setTimeout(r, delay)); + continue; + } + + const classified = classifyLlmError(err); + process.stderr.write( + `[PaperReader][LLM][${traceTag}] request failed: type=${classified.type}, status=${status || 'n/a'}, code=${err?.code || 'n/a'}, message=${err?.message || 'n/a'}\n` + ); + throw new Error(`${classified.message} [status=${status || 'n/a'} code=${err?.code || 'n/a'}]`); + } + } +} + +/** + * 调用 LLM 并解析 JSON 响应 + * + * @param {Array} messages + * @param {object} options + * @returns {Promise} 解析后的 JSON 对象 + */ +async function callLLMJson(messages, options = {}) { + const raw = await callLLM(messages, { + ...options, + temperature: options.temperature ?? 0.1, + traceTag: options.traceTag || 'callLLMJson' + }); + try { + // 尝试从 markdown 代码块中提取 JSON + const jsonMatch = raw.match(/```(?:json)?\s*([\s\S]*?)```/); + const jsonStr = jsonMatch ? 
jsonMatch[1].trim() : raw.trim(); + return JSON.parse(jsonStr); + } catch { + return { raw_response: raw }; + } +} + +module.exports = { callLLM, callLLMJson }; diff --git a/Plugin/PaperReader/lib/mineru-client.js b/Plugin/PaperReader/lib/mineru-client.js new file mode 100644 index 00000000..56ba0183 --- /dev/null +++ b/Plugin/PaperReader/lib/mineru-client.js @@ -0,0 +1,245 @@ +/** + * MinerU Cloud API 适配器 (T1) + * + * 流程: 获取上传URL → PUT上传PDF → 轮询batch结果 → 下载zip → 提取md+figures + */ + +const fs = require('fs').promises; +const path = require('path'); +const axios = require('axios'); + +const MINERU_API_BASE = 'https://mineru.net/api/v4'; + +class MineruError extends Error { + constructor(code, message) { + super(message); + this.name = 'MineruError'; + this.code = code; + } +} + +/** + * 获取预签名上传URL + */ +async function getUploadUrl(token, fileName, modelVersion) { + const resp = await axios.post(`${MINERU_API_BASE}/file-urls/batch`, { + files: [{ name: fileName, data_id: `pr_${Date.now()}` }], + enable_formula: true, + enable_table: true, + model_version: modelVersion + }, { + headers: { + 'Content-Type': 'application/json', + 'Authorization': `Bearer ${token}` + }, + timeout: 30000 + }); + + const data = resp.data; + if (data.code !== 0) { + throw new MineruError('MINERU_AUTH_FAILED', `MinerU API error: code=${data.code}, msg=${data.msg || ''}`); + } + + return { + uploadUrl: data.data.file_urls[0], + batchId: data.data.batch_id + }; +} + +/** + * PUT 上传文件到预签名URL + */ +async function uploadFile(uploadUrl, filePath) { + const fileBuffer = await fs.readFile(filePath); + // MinerU 文档明确说明:上传文件时无须设置 Content-Type 请求头 + // axios 会自动添加 Content-Type/Accept 等头部,导致 OSS 预签名 URL 签名校验失败 + // 改用 Node 原生 https 模块,只发送 Content-Length,完全匹配 Python requests.put(url, data=f) 的行为 + const { URL } = require('url'); + const https = require('https'); + const parsedUrl = new URL(uploadUrl); + + await new Promise((resolve, reject) => { + const req = https.request({ + hostname: parsedUrl.hostname, + port: parsedUrl.port || 443, + path: parsedUrl.pathname + parsedUrl.search, + method: 'PUT', + headers: { + 'Content-Length': fileBuffer.length + }, + timeout: 120000 + }, (res) => { + let body = ''; + res.on('data', chunk => body += chunk); + res.on('end', () => { + if (res.statusCode >= 200 && res.statusCode < 300) { + resolve(); + } else { + reject(new MineruError('MINERU_UPLOAD_FAILED', + `Upload failed: HTTP ${res.statusCode} - ${body.slice(0, 200)}`)); + } + }); + }); + req.on('error', reject); + req.on('timeout', () => { req.destroy(); reject(new MineruError('MINERU_UPLOAD_FAILED', 'Upload timeout')); }); + req.write(fileBuffer); + req.end(); + }); +} + +/** + * 轮询batch结果 + */ +async function pollBatchResult(token, batchId, { timeout = 300000, pollInterval = 5000 } = {}) { + const startTime = Date.now(); + const url = `${MINERU_API_BASE}/extract-results/batch/${batchId}`; + + while (Date.now() - startTime < timeout) { + const resp = await axios.get(url, { + headers: { 'Authorization': `Bearer ${token}` }, + timeout: 15000 + }); + + const data = resp.data; + if (data.code !== 0) { + throw new MineruError('MINERU_PARSE_FAILED', `Batch poll failed: code=${data.code}`); + } + + const results = data.data?.extract_result || []; + if (results.length > 0) { + const first = results[0]; + if (first.state === 'done') { + return first; + } + if (first.state === 'failed') { + throw new MineruError('MINERU_PARSE_FAILED', `Batch task failed: ${first.err_msg || 'unknown'}`); + } + } + + await new Promise(r => setTimeout(r, 
pollInterval)); + } + + throw new MineruError('MINERU_TIMEOUT', `Batch polling timeout after ${timeout}ms`); +} + +/** + * 下载并解压结果zip,提取markdown和图片 + */ +async function downloadAndExtract(zipUrl, outputDir) { + const AdmZip = require('adm-zip'); + + const resp = await axios.get(zipUrl, { + responseType: 'arraybuffer', + timeout: 120000 + }); + + const zip = new AdmZip(resp.data); + const entries = zip.getEntries(); + + let markdown = ''; + const figures = []; + + const figuresDir = path.join(outputDir, 'assets', 'figures'); + await fs.mkdir(figuresDir, { recursive: true }); + + for (const entry of entries) { + const entryName = entry.entryName; + + if (entryName.endsWith('.md') && !entry.isDirectory) { + markdown = entry.getData().toString('utf-8'); + } else if (/\.(png|jpg|jpeg|gif|svg|webp)$/i.test(entryName) && !entry.isDirectory) { + const figName = path.basename(entryName); + const figPath = path.join(figuresDir, figName); + await fs.writeFile(figPath, entry.getData()); + figures.push({ + id: figName.replace(/\.[^.]+$/, ''), + path: `assets/figures/${figName}`, + filename: figName + }); + } + } + + return { markdown, figures }; +} + +/** + * 从markdown中提取figure caption映射 + */ +function extractFigureCaptions(markdown) { + const captions = []; + // 匹配 ![caption](path) 模式 + const imgRegex = /!\[([^\]]*)\]\(([^)]+)\)/g; + let match; + while ((match = imgRegex.exec(markdown)) !== null) { + captions.push({ + caption: match[1], + originalPath: match[2], + id: path.basename(match[2]).replace(/\.[^.]+$/, '') + }); + } + + // 匹配 "Figure X." 或 "Fig. X:" 开头的段落 + const figTextRegex = /^(Fig(?:ure)?\.?\s*\d+[.:]\s*)(.+)$/gm; + while ((match = figTextRegex.exec(markdown)) !== null) { + captions.push({ + caption: match[2].trim(), + label: match[1].trim(), + id: `fig_text_${captions.length}` + }); + } + + return captions; +} + +/** + * 完整流程:上传 PDF → 提交解析 → 轮询 → 返回结果 + * + * @param {string} pdfPath - PDF 绝对路径 + * @param {object} options - { token, timeout, pollInterval, outputDir, modelVersion } + * @returns {Promise<{ markdown: string, figures: Array, pageCount: number, figureMap: Array }>} + */ +async function parsePdf(pdfPath, options = {}) { + const token = options.token || process.env.MINERU_API_TOKEN; + if (!token) { + throw new MineruError('MINERU_AUTH_FAILED', 'MINERU_API_TOKEN is required'); + } + + const timeout = options.timeout || parseInt(process.env.MINERU_API_TIMEOUT || '300000', 10); + const pollInterval = options.pollInterval || parseInt(process.env.MINERU_POLL_INTERVAL || '5000', 10); + const modelVersion = options.modelVersion || process.env.MINERU_MODEL_VERSION || 'pipeline'; + + const fileName = path.basename(pdfPath); + const outputDir = options.outputDir || path.dirname(pdfPath); + + // Step 1: 获取上传URL + const { uploadUrl, batchId } = await getUploadUrl(token, fileName, modelVersion); + + // Step 2: 上传文件 + await uploadFile(uploadUrl, pdfPath); + + // Step 3: 轮询batch结果 (file-urls/batch 自动创建解析任务) + const batchResult = await pollBatchResult(token, batchId, { timeout, pollInterval }); + + // Step 4: 下载并解压结果 + const zipUrl = batchResult.full_zip_url; + if (!zipUrl) { + throw new MineruError('MINERU_PARSE_FAILED', 'No zip URL in result'); + } + + const { markdown, figures } = await downloadAndExtract(zipUrl, outputDir); + + // Step 5: 提取figure captions + const figureMap = extractFigureCaptions(markdown); + + return { + markdown, + figures, + pageCount: batchResult.page_count || null, + figureMap + }; +} + +module.exports = { + parsePdf, + MineruError +}; diff --git 
a/Plugin/PaperReader/lib/pdf-parse-fallback.js b/Plugin/PaperReader/lib/pdf-parse-fallback.js new file mode 100644 index 00000000..006f96b7 --- /dev/null +++ b/Plugin/PaperReader/lib/pdf-parse-fallback.js @@ -0,0 +1,51 @@ +/** + * pdf-parse 降级回退封装 (T2) + * + * 当 MinerU API 不可用时,回退到本地 pdf-parse 纯文本抽取。 + * 输出格式与 mineru-client.js 对齐,但 figures 为空,markdown 为纯文本。 + * + * pdf-parse v2 API: new PDFParse({ data: Uint8Array }) → getText() → destroy() + */ + +const fs = require('fs').promises; +const { PDFParse } = require('pdf-parse'); + +/** + * 使用 pdf-parse 做纯文本抽取(降级模式) + * + * @param {string} pdfPath - PDF 绝对路径 + * @returns {Promise<{ markdown: string, figures: [], pageCount: number, figureMap: [], degraded: true }>} + */ +async function parsePdf(pdfPath) { + const buffer = await fs.readFile(pdfPath); + // pdf-parse v2 要求 Uint8Array 而非 Buffer + const uint8 = new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength); + + const parser = new PDFParse({ data: uint8 }); + let pageCount = null; + let rawText = ''; + + try { + const info = await parser.getInfo(); + pageCount = info.total || null; + + const textResult = await parser.getText(); + rawText = textResult.text || ''; + } finally { + await parser.destroy(); + } + + const markdown = rawText + .replace(/\r\n/g, '\n') + .replace(/\n{3,}/g, '\n\n'); + + return { + markdown, + figures: [], + pageCount, + figureMap: [], + degraded: true + }; +} + +module.exports = { parsePdf }; diff --git a/Plugin/PaperReader/lib/query.js b/Plugin/PaperReader/lib/query.js new file mode 100644 index 00000000..16b21762 --- /dev/null +++ b/Plugin/PaperReader/lib/query.js @@ -0,0 +1,108 @@ +/** + * Query 问答模块 (T7) + * + * Phase 1: 关键词匹配挑选相关 chunk + LLM 问答 + * Phase 2: 升级为向量检索 + */ + +const fs = require('fs').promises; +const fsSync = require('fs'); +const path = require('path'); +const { callLLM } = require('./llm'); + +const WORKSPACE_ROOT = path.join(__dirname, '..', 'workspace'); + +/** + * 关键词匹配挑选相关 chunk + */ +function keywordPick(chunks, question, topK = 6) { + const q = String(question || '').toLowerCase().trim(); + if (!q) return chunks.slice(0, topK); + + const words = q.split(/[\s,;,;。?!?!]+/).filter(w => w.length >= 2).slice(0, 15); + + const scored = chunks.map(c => { + const text = (c.text || '').toLowerCase(); + const section = (c.section || '').toLowerCase(); + let score = 0; + for (const w of words) { + if (text.includes(w)) score += 1; + if (section.includes(w)) score += 2; + } + return { chunk: c, score }; + }); + + scored.sort((a, b) => b.score - a.score); + return scored.slice(0, topK).filter(s => s.score > 0).map(s => s.chunk); +} + +/** + * 对已导入的文档做检索式问答 + * + * @param {string} paperId + * @param {string} question + * @returns {Promise<{ paperId, answer, sources: Array }>} + */ +async function queryPaper(paperId, question) { + if (!paperId) throw new Error('Query requires paperId'); + if (!question) throw new Error('Query requires question'); + + const wsDir = path.join(WORKSPACE_ROOT, paperId); + const manifestPath = path.join(wsDir, 'chunks', 'manifest.json'); + + if (!fsSync.existsSync(manifestPath)) { + throw new Error(`chunks/manifest.json not found: ${manifestPath}`); + } + + const manifest = JSON.parse(await fs.readFile(manifestPath, 'utf-8')); + const chunks = manifest.chunks || []; + + // Load Global Map if exists + const globalMapPath = path.join(wsDir, 'reading_notes', 'Global_Map.md'); + const globalMap = fsSync.existsSync(globalMapPath) + ? 
await fs.readFile(globalMapPath, 'utf-8') + : ''; + + // Pick relevant chunks + const picked = keywordPick(chunks, question, 6); + const contextChunks = picked.length > 0 ? picked : chunks.slice(0, 4); + + // Read chunk files for full content + const contextParts = []; + for (const c of contextChunks) { + const chunkPath = path.join(wsDir, 'chunks', `chunk_${c.index}.md`); + let text; + if (fsSync.existsSync(chunkPath)) { + text = await fs.readFile(chunkPath, 'utf-8'); + } else { + text = c.text || ''; + } + contextParts.push(`---\n[chunk ${c.index} | 章节: ${c.section || 'unknown'}]\n${text}`); + } + const context = contextParts.join('\n\n'); + + const system = [ + '你是一个"文档问答助手",适用于各类长文档(学术论文、技术报告、书籍、法律文书等)。', + '只根据提供的上下文回答;若上下文不足,明确说"证据不足",并给出下一步需要检索的章节/关键词。', + '输出:先给结论,再给证据引用(标注 chunk index 和章节名)。' + ].join('\n'); + + const user = [ + globalMap ? `全局地图:\n${globalMap.slice(0, 2000)}` : '', + `问题:${question}`, + `上下文:\n${context}` + ].filter(Boolean).join('\n\n'); + + const answer = await callLLM([ + { role: 'system', content: system }, + { role: 'user', content: user } + ], { temperature: 0.2 }); + + return { + paperId, + answer, + sources: contextChunks.map(c => ({ index: c.index, section: c.section })) + }; +} + +module.exports = { queryPaper }; diff --git a/Plugin/PaperReader/lib/reading-state.js b/Plugin/PaperReader/lib/reading-state.js new file mode 100644 index 00000000..1d170cc6 --- /dev/null +++ b/Plugin/PaperReader/lib/reading-state.js @@ -0,0 +1,137 @@ +/** + * ReadingState 持久化管理 (v0.4) + * + * 管理 reading_state.json 的读写,支持: + * - 中断恢复 + * - 多轮阅读 + * - 跨会话接力 + */ + +const fs = require('fs').promises; +const fsSync = require('fs'); +const path = require('path'); + +const WORKSPACE_ROOT = path.join(__dirname, '..', 'workspace'); + +/** + * 创建空的 ReadingState + */ +function createEmptyState(docId, goal, mode) { + return { + docId, + goal: goal || '', + mode: mode || 'auto', + currentPhase: 'survey', + round: 1, + rollingContext: '', + readLog: [], + chunkSummaries: [], + auditReport: null, + createdAt: new Date().toISOString(), + updatedAt: new Date().toISOString() + }; +} + +/** + * 获取 reading_state.json 路径 + */ +function getStatePath(docId) { + return path.join(WORKSPACE_ROOT, docId, 'reading_notes', 'reading_state.json'); +} + +/** + * 加载 ReadingState(不存在则返回 null) + */ +async function loadState(docId) { + const statePath = getStatePath(docId); + if (!fsSync.existsSync(statePath)) return null; + try { + const raw = await fs.readFile(statePath, 'utf-8'); + return JSON.parse(raw); + } catch { + return null; + } +} + +/** + * 保存 ReadingState + */ +async function saveState(docId, state) { + const statePath = getStatePath(docId); + const dir = path.dirname(statePath); + await fs.mkdir(dir, { recursive: true }); + state.updatedAt = new Date().toISOString(); + await fs.writeFile(statePath, JSON.stringify(state, null, 2), 'utf-8'); +} + +/** + * 加载或创建 ReadingState + */ +async function loadOrCreateState(docId, goal, mode) { + const existing = await loadState(docId); + if (existing) { + // 如果 goal 不同,创建新的 round + if (goal && existing.goal !== goal) { + existing.round = (existing.round || 1) + 1; + existing.goal = goal; + existing.currentPhase = 'survey'; + existing.auditReport = null; + process.stderr.write(`[PaperReader][State] new round ${existing.round} with different goal\n`); + } + return existing; + } + return createEmptyState(docId, goal, mode); +} + +/** + * 记录一个 chunk 的阅读结果 + */ +function addChunkRead(state, { chunkIndex, section, readMode, nodeId }) { + state.readLog.push({ + 
chunkIndex, + section: section || 'unknown', + readMode, + nodeId: nodeId || null, + readAt: new Date().toISOString(), + round: state.round + }); +} + +/** + * 添加 chunk 摘要 + */ +function addChunkSummary(state, summary) { + // 去重:同 chunkIndex 只保留最新 + state.chunkSummaries = state.chunkSummaries.filter( + s => s.chunkIndex !== summary.chunkIndex + ); + state.chunkSummaries.push(summary); +} + +/** + * 更新阶段 + */ +function setPhase(state, phase) { + state.currentPhase = phase; +} + +/** + * 获取已读 chunk 索引集合(指定 round 或全部) + */ +function getReadChunkIndices(state, round) { + const log = round + ? state.readLog.filter(r => r.round === round) + : state.readLog; + return new Set(log.map(r => r.chunkIndex)); +} + +module.exports = { + createEmptyState, + loadState, + saveState, + loadOrCreateState, + addChunkRead, + addChunkSummary, + setPhase, + getReadChunkIndices +}; diff --git a/Plugin/PaperReader/lib/skeleton.js b/Plugin/PaperReader/lib/skeleton.js new file mode 100644 index 00000000..dd200cf3 --- /dev/null +++ b/Plugin/PaperReader/lib/skeleton.js @@ -0,0 +1,161 @@ +/** + * Skeleton 骨架提取重构 (T5) + * + * 从 Markdown 结构提取目录树、Abstract、Conclusion、Figure Caption, + * 生成 Global Map。不再只读首尾2块。 + */ + +const fs = require('fs').promises; +const fsSync = require('fs'); +const path = require('path'); +const { callLLM } = require('./llm'); +const { extractSections } = require('./chunker'); + +const WORKSPACE_ROOT = path.join(__dirname, '..', 'workspace'); + +/** + * 从 Markdown 提取目录树(标题列表) + */ +function extractTOC(markdown) { + const lines = markdown.split('\n'); + const toc = []; + for (const line of lines) { + const match = line.match(/^(#{1,4})\s+(.+)$/); + if (match) { + toc.push({ + level: match[1].length, + title: match[2].trim(), + indent: ' '.repeat(match[1].length - 1) + }); + } + } + return toc; +} + +/** + * 提取关键章节全文 + */ +function extractKeySections(sections) { + const keyPatterns = [ + /abstract/i, + /introduction/i, + /conclusion/i, + /discussion/i, + /summary/i, + /overview/i, + /background/i, + /preface/i, + /executive.?summary/i, + /摘要/, + /引言/, + /结论/, + /讨论/, + /概述/, + /背景/, + /前言/, + /总结/ + ]; + + const found = []; + for (const section of sections) { + for (const pattern of keyPatterns) { + if (pattern.test(section.title)) { + found.push(section); + break; + } + } + } + return found; +} + +/** + * 从 figure_map.json 加载 Figure Captions + */ +async function loadFigureCaptions(wsDir) { + const figMapPath = path.join(wsDir, 'figure_map.json'); + if (!fsSync.existsSync(figMapPath)) return []; + const raw = await fs.readFile(figMapPath, 'utf-8'); + try { + return JSON.parse(raw); + } catch { + return []; + } +} + +/** + * 从 Markdown 结构提取骨架并生成 Global Map + * + * @param {string} paperId + * @param {object} options - { focus } + * @returns {Promise<{ globalMapPath: string, globalMapContent: string }>} + */ +async function generateSkeleton(paperId, options = {}) { + const wsDir = path.join(WORKSPACE_ROOT, paperId); + const mdPath = path.join(wsDir, 'full_text.md'); + const metaPath = path.join(wsDir, 'meta.json'); + + if (!fsSync.existsSync(mdPath)) { + throw new Error(`full_text.md not found: ${mdPath}`); + } + + const markdown = await fs.readFile(mdPath, 'utf-8'); + const meta = fsSync.existsSync(metaPath) + ? JSON.parse(await fs.readFile(metaPath, 'utf-8')) + : {}; + + // 1. 提取目录树 + const toc = extractTOC(markdown); + const tocText = toc.map(t => `${t.indent}- ${t.title}`).join('\n'); + + // 2. 
提取关键章节 + const sections = extractSections(markdown); + const keySections = extractKeySections(sections); + const keyText = keySections + .map(s => `### ${s.title}\n${s.content.slice(0, 3000)}`) + .join('\n\n'); + + // 3. 加载 Figure Captions + const figureCaptions = await loadFigureCaptions(wsDir); + const captionText = figureCaptions.length > 0 + ? figureCaptions.map(f => `- ${f.label || f.id}: ${f.caption}`).join('\n') + : '(无图注信息)'; + + // 4. 构建 LLM prompt + const system = [ + '你是一个"文档骨架提取器",适用于各类长文档(学术论文、技术报告、书籍章节、法律文书等)。', + '目标:基于目录结构、关键章节和图注,提取文档的全局地图(Global Map)。', + '输出 Markdown,根据文档类型自适应包含以下要素:', + '1. 核心主题(1-2句话概括本文档的核心内容)', + '2. 核心问题/目的(本文档要解决什么问题或传达什么信息)', + '3. 关键内容概要(主要论点、方法、流程、条款等——依文档类型而定)', + '4. 结构路线图(文档的组织逻辑和各部分之间的关系)', + '5. 主要结论/要点', + '6. 局限性/注意事项/风险点', + '7. 各章节阅读优先级标签(High/Medium/Low)', + '8. 后续深读建议(重点关注哪些章节/图表/附录)', + '引用原文短句时标注来自哪个章节。' + ].join('\n'); + + const user = [ + `阅读焦点:${options.focus || '通用理解(全面掌握文档核心内容与结构)'}`, + `元信息:页数=${meta.pageCount ?? 'unknown'}`, + `\n【目录结构】\n${tocText}`, + `\n【关键章节内容】\n${keyText.slice(0, 15000)}`, + `\n【图注列表】\n${captionText}` + ].join('\n\n'); + + const content = await callLLM([ + { role: 'system', content: system }, + { role: 'user', content: user } + ]); + + // 5. 写入 Global_Map.md + const notesDir = path.join(wsDir, 'reading_notes'); + await fs.mkdir(notesDir, { recursive: true }); + const outPath = path.join(notesDir, 'Global_Map.md'); + await fs.writeFile(outPath, content || '', 'utf-8'); + + return { globalMapPath: outPath, globalMapContent: content }; +} + +module.exports = { generateSkeleton, extractTOC, extractKeySections }; diff --git a/Plugin/PaperReader/lib/skim-reader.js b/Plugin/PaperReader/lib/skim-reader.js new file mode 100644 index 00000000..1f036910 --- /dev/null +++ b/Plugin/PaperReader/lib/skim-reader.js @@ -0,0 +1,52 @@ +/** + * Skim Reader 模块 (v0.4) + * + * 轻量摘要:用简化 prompt 处理 skim 标记的 chunk。 + * 核心约束:Skim 结果不写入 Rolling Context(不污染精读上下文)。 + * 支持 upgrade 检测:如果发现高密度信息,自动提升为 deep。 + */ + +const { callLLMJson } = require('./llm'); + +/** + * 对单个 chunk 执行 Skim 摘要 + * + * @param {string} chunkText - chunk 原文 + * @param {object} options - { goal, chunkIndex, section } + * @returns {Promise<{summary: string, upgrade: boolean, reason: string}>} + */ +async function skimChunk(chunkText, { goal, chunkIndex, section }) { + const system = [ + '你是一个快速扫读器。用一句话概括这个章节的核心内容。', + '如果发现与阅读目标高度相关的意外重要内容,标记 upgrade: true。', + '', + '输出 JSON(纯 JSON,不要代码块):', + '{"summary": string, "upgrade": boolean, "reason": string}', + '', + 'upgrade 规则:', + '- true:该 chunk 包含与阅读目标直接相关的关键数据/方法/结论,值得精读', + '- false:该 chunk 是背景/综述/已知信息,扫读即可', + 'reason:解释为什么 upgrade 或不 upgrade(一句话)' + ].join('\n'); + + const user = [ + `阅读目标:${goal || '全面理解文档核心内容'}`, + `当前位置:第 ${chunkIndex} 块,章节「${section}」`, + '', + `【chunk 内容】`, + chunkText + ].join('\n'); + + const result = await callLLMJson([ + { role: 'system', content: system }, + { role: 'user', content: user } + ], { temperature: 0.1, max_tokens: 500, traceTag: `Skim:chunk_${chunkIndex}` }); + + return { + summary: result.summary || result.raw_response || '', + upgrade: result.upgrade === true, + reason: result.reason || '' + }; +} + +module.exports = { skimChunk }; diff --git a/Plugin/PaperReader/plugin-manifest.json b/Plugin/PaperReader/plugin-manifest.json new file mode 100644 index 00000000..25fccc2d --- /dev/null +++ b/Plugin/PaperReader/plugin-manifest.json @@ -0,0 +1,87 @@ +{ + "manifestVersion": "1.0.0", + "name": "PaperReader", + "version": "0.4.0", + "displayName": "超文本递归阅读器", + "description": 
"统一自适应阅读引擎:将超长 PDF/文档转为目标驱动的多分辨率阅读流程。v0.4: 统一 Read 命令(Survey→Triage→DeepDive/Skim→Audit→Synthesize)、Triage 分诊注意力分配、Skim 轻量扫读、Auditor 去偏见审核、ReadingState 持久化。MinerU 云端高保真解析,不可用时自动降级到 pdf-parse。", + "author": "VCP", + "pluginType": "synchronous", + "entryPoint": { + "type": "nodejs", + "command": "node PaperReader.js" + }, + "communication": { + "protocol": "stdio", + "timeout": 1800000 + }, + "configSchema": { + "MINERU_API_TOKEN": { + "type": "string", + "description": "MinerU 云端 API Token(从 mineru.net 获取)。不填则自动降级到 pdf-parse。" + }, + "MINERU_API_TIMEOUT": { + "type": "integer", + "description": "MinerU 轮询超时(ms),默认 300000(5分钟)。" + }, + "MINERU_POLL_INTERVAL": { + "type": "integer", + "description": "MinerU 轮询间隔(ms),默认 5000。" + }, + "PaperReaderChunkSize": { + "type": "integer", + "description": "目标 chunk 大小(tokens),默认 2000。" + }, + "PaperReaderOverlap": { + "type": "number", + "description": "chunk 重叠比例,默认 0.15。" + }, + "PaperReaderModel": { + "type": "string", + "description": "用于阅读/总结的模型名称。" + }, + "PaperReaderMaxOutputTokens": { + "type": "integer", + "description": "单次模型输出上限,默认 12000。" + }, + "PaperReaderBatchSize": { + "type": "integer", + "description": "Deep 阅读分批处理的 chunk 数,默认 5。同批内共享 Rolling Context 快照,越大速度越快但上下文递进越弱。" + }, + "PaperReaderMaxConcurrentLLM": { + "type": "integer", + "description": "进程级 LLM 最大并发请求数(semaphore),默认 5。真正的并发控制。" + }, + "PaperReaderMaxChunks": { + "type": "integer", + "description": "单次阅读最多处理的 chunk 数(防止成本失控),默认 120。" + }, + "PaperReaderMaxAuditChunks": { + "type": "integer", + "description": "Auditor 审核抽样最大 chunk 数,默认 8。" + } + }, + "capabilities": { + "invocationCommands": [ + { + "commandIdentifier": "IngestPDF", + "description": "解析 PDF 并生成可递归阅读的分块工件。支持学术论文、技术报告、书籍、法律文书等各类 PDF。使用 MinerU 云端 API 高保真解析(保留公式/表格/图片/多栏排版),失败自动降级到 pdf-parse。输出章节感知的 chunk 文件。\n参数:\n- command: 固定为 IngestPDF\n- filePath (字符串, 必需): PDF 绝对路径\n- paperId (字符串, 可选): 自定义 ID(不传则自动生成)\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」IngestPDF「末」,\nfilePath:「始」D:\\\\books\\\\paper.pdf「末」\n<<<[END_TOOL_REQUEST]>>>" + }, + { + "commandIdentifier": "Read", + "description": "v0.4 统一自适应阅读命令。自动执行完整流程:Survey(骨架提取)→ Triage(分诊注意力分配)→ DeepDive/Skim(精读/扫读)→ Audit(去偏见审核)→ Synthesize(合成报告)。这是推荐的阅读方式,自动决定哪些章节精读、哪些扫读、哪些跳过。注意:处理大文档(100+ chunks)可能需要数分钟。\n参数:\n- command: 固定为 Read\n- paperId (字符串, 必需): 文档 ID\n- goal (字符串, 可选): 阅读目标(影响 Triage 分诊策略)\n- forceReread (布尔值, 可选): 强制重新阅读(忽略缓存)\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」Read「末」,\npaperId:「始」xray-20260208-xxx「末」,\ngoal:「始」理解核心方法和实验设计「末」\n<<<[END_TOOL_REQUEST]>>>" + }, + { + "commandIdentifier": "ReadSkeleton", + "description": "基于已导入的文档工件生成骨架地图(Global Map)+ 层级树索引。Read 命令会自动调用此步骤,通常不需要单独使用。\n参数:\n- command: 固定为 ReadSkeleton\n- paperId (字符串, 必需)\n- focus (字符串, 可选): 本次阅读关注点\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」ReadSkeleton「末」,\npaperId:「始」xray-xxx「末」,\nfocus:「始」方法学与实验设计「末」\n<<<[END_TOOL_REQUEST]>>>" + }, + { + "commandIdentifier": "ReadDeep", + "description": "全量 Rolling Context 深度阅读(无 Triage/Audit 的 v0.3 兼容模式)。对所有 chunk 无差别精读。推荐使用 Read 命令替代。\n参数:\n- command: 固定为 ReadDeep\n- paperId (字符串, 必需)\n- goal (字符串, 可选): 主任务目标\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」ReadDeep「末」,\npaperId:「始」xray-xxx「末」,\ngoal:「始」快速理解核心贡献「末」\n<<<[END_TOOL_REQUEST]>>>" + }, + { + "commandIdentifier": "Query", + "description": "对已导入的文档做检索式问答(树索引推理式检索,降级到关键词匹配)。返回答案、推理过程及引用来源。\n参数:\n- command: 固定为 Query\n- paperId (字符串, 必需)\n- question (字符串, 
必需)\n调用格式:\n<<<[TOOL_REQUEST]>>>\ntool_name:「始」PaperReader「末」,\ncommand:「始」Query「末」,\npaperId:「始」xray-xxx「末」,\nquestion:「始」这份文档的核心结论是什么?「末」\n<<<[END_TOOL_REQUEST]>>>" + } + ] + } +}
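For reference, the stdout envelope written by `sendResponse` follows the shape sketched below; the concrete field values are illustrative only (a real `answer` and `sources` depend on the queried document).

```js
// Illustrative response envelopes (field values are made up for the example)
const exampleSuccess = {
  status: 'success',
  result: {
    paperId: 'paper-1a2b3c4d5e',   // auto-generated ID: paper-<sha1 prefix>
    answer: '结论:……(证据:chunk 3,章节「Conclusion」)',
    sources: [{ index: 3, section: 'Conclusion' }, { index: 0, section: 'Abstract' }]
  }
};

const exampleError = {
  status: 'error',
  error: 'Query requires paperId'
};
```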