211 changes: 211 additions & 0 deletions Plugin/PaperReader/PaperReader.js
@@ -0,0 +1,211 @@
/**
* PaperReader v0.2: main entry point
*
* Receives JSON on stdin → routes it to the matching command handler → writes JSON to stdout
*/
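// Example request on stdin (illustrative values):
//   { "command": "IngestPDF", "filePath": "./paper.pdf", "forceReparse": false }
// Example response on stdout:
//   { "status": "success", "result": { ... } }  or  { "status": "error", "error": "..." }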

const fs = require('fs').promises;
const fsSync = require('fs');
const path = require('path');
const crypto = require('crypto');

require('dotenv').config({ path: path.join(__dirname, 'config.env') });
require('dotenv').config({ path: path.join(__dirname, '..', '..', 'config.env') });

const { ingestPdf } = require('./lib/ingest');
const { chunkMarkdown } = require('./lib/chunker');
const { generateSkeleton } = require('./lib/skeleton');
const { readDeep } = require('./lib/deep-reader');
const { queryPaper } = require('./lib/query');

const WORKSPACE_ROOT = path.join(__dirname, 'workspace');

function sendResponse(data) {
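// Always exits with code 0; success vs. error is signaled in the JSON payload, not the exit code.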
process.stdout.write(JSON.stringify(data));
process.exit(0);
}

function sha1(input) {
return crypto.createHash('sha1').update(input).digest('hex');
}

function getPaperWorkspace(paperId) {
return path.join(WORKSPACE_ROOT, paperId);
}

async function writeJson(filePath, obj) {
await fs.writeFile(filePath, JSON.stringify(obj, null, 2), 'utf-8');
}

// ─── Command Handlers ───

async function handleIngestPDF({ filePath, paperId, forceReparse }) {
if (!filePath || typeof filePath !== 'string') {
throw new Error('IngestPDF requires filePath');
}

const abs = path.isAbsolute(filePath) ? filePath : path.resolve(process.cwd(), filePath);
if (!fsSync.existsSync(abs)) {
throw new Error(`PDF not found: ${abs}`);
}

const resolvedPaperId = paperId && String(paperId).trim()
? String(paperId).trim()
: `paper-${sha1(abs).slice(0, 10)}`;

const wsDir = getPaperWorkspace(resolvedPaperId);
const manifestPath = path.join(wsDir, 'chunks', 'manifest.json');
const metaPath = path.join(wsDir, 'meta.json');

// ── Cache check: if manifest + meta already exist, skip re-parsing ──
if (!forceReparse && fsSync.existsSync(manifestPath) && fsSync.existsSync(metaPath)) {
const existingMeta = JSON.parse(await fs.readFile(metaPath, 'utf-8'));
const existingManifest = JSON.parse(await fs.readFile(manifestPath, 'utf-8'));
process.stderr.write(`[PaperReader][Ingest] cache hit: paperId=${resolvedPaperId}, chunkCount=${existingManifest.chunkCount}, engine=${existingMeta.engine}\n`);
return {
paperId: resolvedPaperId,
workspace: wsDir,
pageCount: existingMeta.pageCount,
chunkCount: existingManifest.chunkCount,
engine: existingMeta.engine,
cached: true
};
}

process.stderr.write(`[PaperReader][Ingest] no cache, starting full parse: paperId=${resolvedPaperId}\n`);

await fs.mkdir(wsDir, { recursive: true });

// L0: parse the PDF → Markdown + figures
const parsed = await ingestPdf(abs, { outputDir: wsDir });

// Save meta
const meta = {
paperId: resolvedPaperId,
sourceFilePath: abs,
extractedAt: new Date().toISOString(),
pageCount: parsed.pageCount,
textLength: (parsed.markdown || '').length,
engine: parsed.engine
};
await writeJson(metaPath, meta);

// Save full markdown
await fs.writeFile(path.join(wsDir, 'full_text.md'), parsed.markdown || '', 'utf-8');

// Save figure map
if (parsed.figureMap && parsed.figureMap.length > 0) {
await writeJson(path.join(wsDir, 'figure_map.json'), parsed.figureMap);
}

// L1: section-aware chunking
const chunks = chunkMarkdown(parsed.markdown || '');

// Save chunks
const chunksDir = path.join(wsDir, 'chunks');
await fs.mkdir(chunksDir, { recursive: true });

for (const chunk of chunks) {
const chunkContent = chunk.metaHeader
? `${chunk.metaHeader}\n\n---\n\n${chunk.text}`
: chunk.text;
await fs.writeFile(
path.join(chunksDir, `chunk_${chunk.index}.md`),
chunkContent,
'utf-8'
);
}

// Save manifest
const manifest = {
chunkCount: chunks.length,
chunks: chunks.map(c => ({
index: c.index,
section: c.section,
tokenCount: c.tokenCount
}))
};
await writeJson(manifestPath, manifest);

// Create reading_notes dir
await fs.mkdir(path.join(wsDir, 'reading_notes'), { recursive: true });

return {
paperId: resolvedPaperId,
workspace: wsDir,
pageCount: meta.pageCount,
chunkCount: chunks.length,
engine: parsed.engine,
cached: false
};
}

async function handleReadSkeleton({ paperId, focus }) {
if (!paperId) throw new Error('ReadSkeleton requires paperId');
const result = await generateSkeleton(paperId, { focus });
return { paperId, globalMapPath: result.globalMapPath, content: result.globalMapContent };
}

async function handleReadDeep({ paperId, goal, maxChunks, batchSize, forceReread }) {
if (!paperId) throw new Error('ReadDeep requires paperId');
const opts = { goal };
if (maxChunks) opts.maxChunks = maxChunks;
if (batchSize) opts.batchSize = batchSize;
if (forceReread) opts.forceReread = true;
const result = await readDeep(paperId, opts);
// Read the Round_1_Summary.md to return its content
const summaryContent = fsSync.existsSync(result.roundPath)
? (await fs.readFile(result.roundPath, 'utf-8'))
: '';
return { ...result, content: summaryContent };
}

async function handleQuery({ paperId, question }) {
if (!paperId) throw new Error('Query requires paperId');
return await queryPaper(paperId, question);
}

// ─── Main ───

async function main() {
let inputData = '';
process.stdin.setEncoding('utf8');
for await (const chunk of process.stdin) inputData += chunk;

let request = {};
try {
request = JSON.parse(inputData || '{}');
} catch (parseErr) {
sendResponse({ status: 'error', error: `Invalid JSON on stdin: ${parseErr.message}` });
}
const command = request.command;

process.stderr.write(`[PaperReader][Main] request received: command=${command || 'undefined'}, paperId=${request.paperId || 'n/a'}\n`);

try {
if (!command) throw new Error('Missing command');

let result;
switch (command) {
case 'IngestPDF':
process.stderr.write('[PaperReader][Main] route hit: IngestPDF\n');
result = await handleIngestPDF({ filePath: request.filePath, paperId: request.paperId, forceReparse: request.forceReparse });
break;
case 'ReadSkeleton':
process.stderr.write('[PaperReader][Main] route hit: ReadSkeleton\n');
result = await handleReadSkeleton({ paperId: request.paperId, focus: request.focus });
break;
case 'ReadDeep':
process.stderr.write('[PaperReader][Main] route hit: ReadDeep\n');
result = await handleReadDeep({ paperId: request.paperId, goal: request.goal, maxChunks: request.maxChunks, batchSize: request.batchSize, forceReread: request.forceReread });
break;
case 'Query':
process.stderr.write('[PaperReader][Main] route hit: Query\n');
result = await handleQuery({ paperId: request.paperId, question: request.question });
break;
default:
throw new Error(`Unknown command: ${command}`);
}

sendResponse({ status: 'success', result });
} catch (err) {
process.stderr.write(`[PaperReader][Main] request failed: command=${command || 'undefined'}, error=${err?.message || String(err)}\n`);
sendResponse({ status: 'error', error: err?.message || String(err) });
}
}

main();
68 changes: 68 additions & 0 deletions Plugin/PaperReader/README.md
@@ -0,0 +1,68 @@
# PaperReader (v0.2)

## Design Goals

Turns very long PDFs / documents into a controllable, recursive reading workflow. Works for long documents of all kinds: academic papers, technical reports, legal documents, book chapters, and more.

1. **L0 parsing layer**: high-fidelity parsing via the MinerU cloud API (preserves formulas / tables / figures / multi-column layout), with automatic fallback to pdf-parse
2. **L1 chunking layer**: section-aware chunking + Meta-Header injection + 10-20% overlap (see the sketch below)
3. **L2 recursive logic layer**: skeleton extraction / Rolling Context deep reading / merge and synthesis
4. **L3 storage & interaction layer**: Obsidian-friendly Markdown directory structure
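
The actual chunker lives in `lib/chunker.js`; as a rough sketch of the L1 idea only (word counts stand in for real token counts, and the heading detection / Meta-Header injection is omitted), an overlapping split could look like:

```js
// Rough illustration of overlap-based chunking, not the real lib/chunker.js logic.
function sketchChunks(markdown, targetTokens = 2000, overlapRatio = 0.15) {
  const words = markdown.split(/\s+/);              // crude stand-in for tiktoken counts
  const step = Math.max(1, Math.floor(targetTokens * (1 - overlapRatio)));
  const chunks = [];
  for (let start = 0; start < words.length; start += step) {
    chunks.push({
      index: chunks.length,
      text: words.slice(start, start + targetTokens).join(' ')
    });
  }
  return chunks;
}
```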

## Commands

| Command | Purpose |
|------|------|
| `IngestPDF` | PDF → Markdown → section-aware chunks |
| `ReadSkeleton` | Generates a Global Map from the table of contents / abstract / key sections |
| `ReadDeep` | Recursive summarization with Rolling Context → Round-1 notes |
| `Query` | Retrieval-style Q&A (keyword matching + section weighting) |
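
All commands go through the same stdin/stdout JSON protocol; a minimal invocation sketch (run from the repository root, with placeholder values):

```js
// Illustrative call; the paperId and question are placeholders.
const { execFileSync } = require('child_process');

const request = { command: 'Query', paperId: 'paper-abc1234567', question: 'What is the core contribution?' };
const raw = execFileSync('node', ['Plugin/PaperReader/PaperReader.js'], {
  input: JSON.stringify(request),
  encoding: 'utf-8'
});
const response = JSON.parse(raw); // { status: 'success', result: ... } or { status: 'error', error: ... }
```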

## Artifact Layout

```
workspace/{paperId}/
├── meta.json                # metadata (includes the parsing engine used)
├── full_text.md             # full Markdown (L0 output)
├── figure_map.json          # Figure_ID ↔ caption mapping
├── assets/
│   └── figures/             # extracted images
├── chunks/
│   ├── manifest.json        # chunk manifest + section mapping
│   └── chunk_{i}.md         # individual chunks (with Meta-Header)
└── reading_notes/
    ├── Global_Map.md        # skeleton map
    ├── Chunk_Summaries.json # per-chunk summaries
    └── Round_1_Summary.md   # deep-reading notes
```
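
The artifacts are plain Markdown/JSON, so they can be inspected directly; for example (placeholder paperId, paths assumed relative to the repository root):

```js
// Read the chunk manifest and the first chunk for a given paper (illustrative id and paths).
const fs = require('fs');
const path = require('path');

const wsDir = path.join('Plugin', 'PaperReader', 'workspace', 'paper-abc1234567');
const manifest = JSON.parse(fs.readFileSync(path.join(wsDir, 'chunks', 'manifest.json'), 'utf-8'));
// manifest.chunks is an array of { index, section, tokenCount }
const firstChunk = fs.readFileSync(path.join(wsDir, 'chunks', `chunk_${manifest.chunks[0].index}.md`), 'utf-8');
console.log(manifest.chunkCount, firstChunk.slice(0, 200));
```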

## Configuration

Copy `config.env.example` to `config.env` and fill in:
- `MINERU_API_TOKEN`: MinerU cloud API token (leave empty to fall back automatically)
- `PaperReaderModel`: LLM model name
- See `config.env.example` for the full set of options

## Dependencies

- `axios`: HTTP requests
- `pdf-parse`: PDF parsing in fallback mode
- `adm-zip`: unpacking the zip archives returned by MinerU
- `@dqbd/tiktoken`: token counting
- `dotenv`: environment variables

## Supported Document Types

The MinerU cloud API can parse:
- Academic papers (multi-column layouts, formulas, citations)
- Technical reports / whitepapers
- Book chapters
- Legal documents / contracts
- Scanned PDFs (built-in OCR)
- Documents with complex tables

## Known Limitations

- MinerU free tier: 2,000 pages per day; 200 MB / 600 pages per file
- Rolling Context is capped at 4,000 tokens and is compressed automatically when the cap is exceeded
- Query currently uses keyword matching (vector retrieval is planned for Phase 2)
37 changes: 37 additions & 0 deletions Plugin/PaperReader/config.env.example
@@ -0,0 +1,37 @@
# PaperReader plugin configuration (example)

# === L0 parsing layer ===
# MinerU cloud API token (from mineru.net, under "Profile → API Key Management")
# Note: this is the Bearer token (one long string), not the Access Key / Secret Key
# Leave empty to fall back automatically to plain-text parsing via pdf-parse
MINERU_API_TOKEN=
# MinerU model version: pipeline (default, fast) or vlm (better quality, slower)
MINERU_MODEL_VERSION=pipeline
# Polling timeout (ms), default 5 minutes
MINERU_API_TIMEOUT=300000
# Polling interval (ms), default 5 seconds
MINERU_POLL_INTERVAL=5000

# === L1 chunking layer ===
# Target chunk size (tokens)
PaperReaderChunkSize=2000
# Chunk overlap ratio
PaperReaderOverlap=0.15

# === L2 recursive logic layer ===
# Reading/summarization model (called via VCP's API_URL/API_Key against /v1/chat/completions)
PaperReaderModel=gemini-2.5-flash-search
# Max output tokens per model call
PaperReaderMaxOutputTokens=12000
# Batch size for grouped processing (chunks handled per group; keep it ≤ MaxConcurrentLLM)
# ⚠️ Quality trade-off: deep-read chunks in the same batch share a single snapshot of the Rolling Context.
# BatchSize=1 (serial): strongest context carry-over; chunk N sees every finding from chunks 1..N-1
# BatchSize=5 (recommended): the sweet spot between speed and quality
# BatchSize=10+: fastest, but chunks in the same batch cannot see each other's findings (skim is unaffected)
# For very high-precision work (legal/financial line-by-line audits), keep it ≤ 3
PaperReaderBatchSize=5
# Process-level cap on concurrent LLM requests (prevents 429 storms; 3-8 recommended)
# Actual concurrency is governed by this semaphore; BatchSize only controls how much context is shared within a batch
PaperReaderMaxConcurrentLLM=5
# Maximum number of chunks processed in a deep read (keeps costs bounded)
PaperReaderMaxChunks=120