211 changes: 211 additions & 0 deletions Plugin/PaperReader/PaperReader.js
@@ -0,0 +1,211 @@
/**
* PaperReader v0.2: main entry point
*
* Receives JSON on stdin → routes it to the matching command handler → writes JSON to stdout
*/
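// Example request on stdin (illustrative values):
//   { "command": "IngestPDF", "filePath": "./paper.pdf", "forceReparse": false }
// Example response on stdout:
//   { "status": "success", "result": { ... } }  or  { "status": "error", "error": "..." }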

const fs = require('fs').promises;
const fsSync = require('fs');
const path = require('path');
const crypto = require('crypto');

require('dotenv').config({ path: path.join(__dirname, 'config.env') });
require('dotenv').config({ path: path.join(__dirname, '..', '..', 'config.env') });

const { ingestPdf } = require('./lib/ingest');
const { chunkMarkdown } = require('./lib/chunker');
const { generateSkeleton } = require('./lib/skeleton');
const { readDeep } = require('./lib/deep-reader');
const { queryPaper } = require('./lib/query');

const WORKSPACE_ROOT = path.join(__dirname, 'workspace');

function sendResponse(data) {
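// Always exits with code 0; success vs. error is signaled in the JSON payload, not the exit code.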
process.stdout.write(JSON.stringify(data));
process.exit(0);
}

function sha1(input) {
return crypto.createHash('sha1').update(input).digest('hex');
}

function getPaperWorkspace(paperId) {
return path.join(WORKSPACE_ROOT, paperId);
}

async function writeJson(filePath, obj) {
await fs.writeFile(filePath, JSON.stringify(obj, null, 2), 'utf-8');
}

// ─── Command Handlers ───

async function handleIngestPDF({ filePath, paperId, forceReparse }) {
if (!filePath || typeof filePath !== 'string') {
throw new Error('IngestPDF requires filePath');
}

const abs = path.isAbsolute(filePath) ? filePath : path.resolve(process.cwd(), filePath);
if (!fsSync.existsSync(abs)) {
throw new Error(`PDF not found: ${abs}`);
}

const resolvedPaperId = paperId && String(paperId).trim()
? String(paperId).trim()
: `paper-${sha1(abs).slice(0, 10)}`;

const wsDir = getPaperWorkspace(resolvedPaperId);
const manifestPath = path.join(wsDir, 'chunks', 'manifest.json');
const metaPath = path.join(wsDir, 'meta.json');

// ── Cache check: if manifest + meta already exist, skip re-parsing ──
if (!forceReparse && fsSync.existsSync(manifestPath) && fsSync.existsSync(metaPath)) {
const existingMeta = JSON.parse(await fs.readFile(metaPath, 'utf-8'));
const existingManifest = JSON.parse(await fs.readFile(manifestPath, 'utf-8'));
process.stderr.write(`[PaperReader][Ingest] cache hit: paperId=${resolvedPaperId}, chunkCount=${existingManifest.chunkCount}, engine=${existingMeta.engine}\n`);
return {
paperId: resolvedPaperId,
workspace: wsDir,
pageCount: existingMeta.pageCount,
chunkCount: existingManifest.chunkCount,
engine: existingMeta.engine,
cached: true
};
}

process.stderr.write(`[PaperReader][Ingest] no cache, starting full parse: paperId=${resolvedPaperId}\n`);

await fs.mkdir(wsDir, { recursive: true });

// L0: parse the PDF → Markdown + figures
const parsed = await ingestPdf(abs, { outputDir: wsDir });

// Save meta
const meta = {
paperId: resolvedPaperId,
sourceFilePath: abs,
extractedAt: new Date().toISOString(),
pageCount: parsed.pageCount,
textLength: (parsed.markdown || '').length,
engine: parsed.engine
};
await writeJson(metaPath, meta);

// Save full markdown
await fs.writeFile(path.join(wsDir, 'full_text.md'), parsed.markdown || '', 'utf-8');

// Save figure map
if (parsed.figureMap && parsed.figureMap.length > 0) {
await writeJson(path.join(wsDir, 'figure_map.json'), parsed.figureMap);
}

// L1: section-aware chunking
const chunks = chunkMarkdown(parsed.markdown || '');

// Save chunks
const chunksDir = path.join(wsDir, 'chunks');
await fs.mkdir(chunksDir, { recursive: true });

for (const chunk of chunks) {
const chunkContent = chunk.metaHeader
? `${chunk.metaHeader}\n\n---\n\n${chunk.text}`
: chunk.text;
await fs.writeFile(
path.join(chunksDir, `chunk_${chunk.index}.md`),
chunkContent,
'utf-8'
);
}

// Save manifest
const manifest = {
chunkCount: chunks.length,
chunks: chunks.map(c => ({
index: c.index,
section: c.section,
tokenCount: c.tokenCount
}))
};
await writeJson(manifestPath, manifest);

// Create reading_notes dir
await fs.mkdir(path.join(wsDir, 'reading_notes'), { recursive: true });

return {
paperId: resolvedPaperId,
workspace: wsDir,
pageCount: meta.pageCount,
chunkCount: chunks.length,
engine: parsed.engine,
cached: false
};
}

async function handleReadSkeleton({ paperId, focus }) {
if (!paperId) throw new Error('ReadSkeleton requires paperId');
const result = await generateSkeleton(paperId, { focus });
return { paperId, globalMapPath: result.globalMapPath, content: result.globalMapContent };
}

async function handleReadDeep({ paperId, goal, maxChunks, batchSize, forceReread }) {
if (!paperId) throw new Error('ReadDeep requires paperId');
const opts = { goal };
if (maxChunks) opts.maxChunks = maxChunks;
if (batchSize) opts.batchSize = batchSize;
if (forceReread) opts.forceReread = true;
const result = await readDeep(paperId, opts);
// Read the Round_1_Summary.md to return its content
const summaryContent = fsSync.existsSync(result.roundPath)
? (await fs.readFile(result.roundPath, 'utf-8'))
: '';
return { ...result, content: summaryContent };
}

async function handleQuery({ paperId, question }) {
if (!paperId) throw new Error('Query requires paperId');
return await queryPaper(paperId, question);
}

// ─── Main ───

async function main() {
let inputData = '';
process.stdin.setEncoding('utf8');
for await (const chunk of process.stdin) inputData += chunk;

let request = {};
try {
request = JSON.parse(inputData || '{}');
} catch (parseErr) {
sendResponse({ status: 'error', error: `Invalid JSON on stdin: ${parseErr.message}` });
}
const command = request.command;

process.stderr.write(`[PaperReader][Main] request received: command=${command || 'undefined'}, paperId=${request.paperId || 'n/a'}\n`);

try {
if (!command) throw new Error('Missing command');

let result;
switch (command) {
case 'IngestPDF':
process.stderr.write('[PaperReader][Main] route hit: IngestPDF\n');
result = await handleIngestPDF({ filePath: request.filePath, paperId: request.paperId, forceReparse: request.forceReparse });
break;
case 'ReadSkeleton':
process.stderr.write('[PaperReader][Main] route hit: ReadSkeleton\n');
result = await handleReadSkeleton({ paperId: request.paperId, focus: request.focus });
break;
case 'ReadDeep':
process.stderr.write('[PaperReader][Main] route hit: ReadDeep\n');
result = await handleReadDeep({ paperId: request.paperId, goal: request.goal, maxChunks: request.maxChunks, batchSize: request.batchSize, forceReread: request.forceReread });
break;
case 'Query':
process.stderr.write('[PaperReader][Main] route hit: Query\n');
result = await handleQuery({ paperId: request.paperId, question: request.question });
break;
default:
throw new Error(`Unknown command: ${command}`);
}

sendResponse({ status: 'success', result });
} catch (err) {
process.stderr.write(`[PaperReader][Main] request failed: command=${command || 'undefined'}, error=${err?.message || String(err)}\n`);
sendResponse({ status: 'error', error: err?.message || String(err) });
}
}

main();
68 changes: 68 additions & 0 deletions Plugin/PaperReader/README.md
@@ -0,0 +1,68 @@
# PaperReader (v0.2)

## Design Goals

Turns very long PDFs / documents into a controllable, recursive reading workflow. Works for long documents of all kinds: academic papers, technical reports, legal documents, book chapters, and more.

1. **L0 parsing layer**: high-fidelity parsing via the MinerU cloud API (preserves formulas / tables / figures / multi-column layout), with automatic fallback to pdf-parse
2. **L1 chunking layer**: section-aware chunking + Meta-Header injection + 10-20% overlap (see the sketch below)
3. **L2 recursive logic layer**: skeleton extraction / Rolling Context deep reading / merge and synthesis
4. **L3 storage & interaction layer**: Obsidian-friendly Markdown directory structure
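
The actual chunker lives in `lib/chunker.js`; as a rough sketch of the L1 idea only (word counts stand in for real token counts, and the heading detection / Meta-Header injection is omitted), an overlapping split could look like:

```js
// Rough illustration of overlap-based chunking, not the real lib/chunker.js logic.
function sketchChunks(markdown, targetTokens = 2000, overlapRatio = 0.15) {
  const words = markdown.split(/\s+/);              // crude stand-in for tiktoken counts
  const step = Math.max(1, Math.floor(targetTokens * (1 - overlapRatio)));
  const chunks = [];
  for (let start = 0; start < words.length; start += step) {
    chunks.push({
      index: chunks.length,
      text: words.slice(start, start + targetTokens).join(' ')
    });
  }
  return chunks;
}
```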

## Commands

| Command | Purpose |
|------|------|
| `IngestPDF` | PDF → Markdown → section-aware chunks |
| `ReadSkeleton` | Generates a Global Map from the table of contents / abstract / key sections |
| `ReadDeep` | Recursive summarization with Rolling Context → Round-1 notes |
| `Query` | Retrieval-style Q&A (keyword matching + section weighting) |
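
All commands go through the same stdin/stdout JSON protocol; a minimal invocation sketch (run from the repository root, with placeholder values):

```js
// Illustrative call; the paperId and question are placeholders.
const { execFileSync } = require('child_process');

const request = { command: 'Query', paperId: 'paper-abc1234567', question: 'What is the core contribution?' };
const raw = execFileSync('node', ['Plugin/PaperReader/PaperReader.js'], {
  input: JSON.stringify(request),
  encoding: 'utf-8'
});
const response = JSON.parse(raw); // { status: 'success', result: ... } or { status: 'error', error: ... }
```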

## Artifact Layout

```
workspace/{paperId}/
├── meta.json                # metadata (includes the parsing engine used)
├── full_text.md             # full Markdown (L0 output)
├── figure_map.json          # Figure_ID ↔ caption mapping
├── assets/
│   └── figures/             # extracted images
├── chunks/
│   ├── manifest.json        # chunk manifest + section mapping
│   └── chunk_{i}.md         # individual chunks (with Meta-Header)
└── reading_notes/
    ├── Global_Map.md        # skeleton map
    ├── Chunk_Summaries.json # per-chunk summaries
    └── Round_1_Summary.md   # deep-reading notes
```
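
The artifacts are plain Markdown/JSON, so they can be inspected directly; for example (placeholder paperId, paths assumed relative to the repository root):

```js
// Read the chunk manifest and the first chunk for a given paper (illustrative id and paths).
const fs = require('fs');
const path = require('path');

const wsDir = path.join('Plugin', 'PaperReader', 'workspace', 'paper-abc1234567');
const manifest = JSON.parse(fs.readFileSync(path.join(wsDir, 'chunks', 'manifest.json'), 'utf-8'));
// manifest.chunks is an array of { index, section, tokenCount }
const firstChunk = fs.readFileSync(path.join(wsDir, 'chunks', `chunk_${manifest.chunks[0].index}.md`), 'utf-8');
console.log(manifest.chunkCount, firstChunk.slice(0, 200));
```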

## Configuration

Copy `config.env.example` to `config.env` and fill in:
- `MINERU_API_TOKEN`: MinerU cloud API token (leave empty to fall back automatically)
- `PaperReaderModel`: LLM model name
- See `config.env.example` for the full set of options

## Dependencies

- `axios`: HTTP requests
- `pdf-parse`: PDF parsing in fallback mode
- `adm-zip`: unpacking the zip archives returned by MinerU
- `@dqbd/tiktoken`: token counting
- `dotenv`: environment variables

## Supported Document Types

The MinerU cloud API can parse:
- Academic papers (multi-column layouts, formulas, citations)
- Technical reports / whitepapers
- Book chapters
- Legal documents / contracts
- Scanned PDFs (built-in OCR)
- Documents with complex tables

## Known Limitations

- MinerU free tier: 2,000 pages per day; 200 MB / 600 pages per file
- Rolling Context is capped at 4,000 tokens and is compressed automatically when the cap is exceeded
- Query currently uses keyword matching (vector retrieval is planned for Phase 2)
37 changes: 37 additions & 0 deletions Plugin/PaperReader/config.env.example
@@ -0,0 +1,37 @@
# PaperReader plugin configuration (example)

# === L0 parsing layer ===
# MinerU cloud API token (from mineru.net, under "Profile → API Key Management")
# Note: this is the Bearer token (one long string), not the Access Key / Secret Key
# Leave empty to fall back automatically to plain-text parsing via pdf-parse
MINERU_API_TOKEN=
# MinerU model version: pipeline (default, fast) or vlm (better quality, slower)
MINERU_MODEL_VERSION=pipeline
# Polling timeout (ms), default 5 minutes
MINERU_API_TIMEOUT=300000
# Polling interval (ms), default 5 seconds
MINERU_POLL_INTERVAL=5000

# === L1 chunking layer ===
# Target chunk size (tokens)
PaperReaderChunkSize=2000
# Chunk overlap ratio
PaperReaderOverlap=0.15

# === L2 recursive logic layer ===
# Reading/summarization model (called via VCP's API_URL/API_Key against /v1/chat/completions)
PaperReaderModel=gemini-2.5-flash-search
# Max output tokens per model call
PaperReaderMaxOutputTokens=12000
# Batch size for grouped processing (chunks handled per group; keep it ≤ MaxConcurrentLLM)
# ⚠️ Quality trade-off: deep-read chunks in the same batch share a single snapshot of the Rolling Context.
# BatchSize=1 (serial): strongest context carry-over; chunk N sees every finding from chunks 1..N-1
# BatchSize=5 (recommended): the sweet spot between speed and quality
# BatchSize=10+: fastest, but chunks in the same batch cannot see each other's findings (skim is unaffected)
# For very high-precision work (legal/financial line-by-line audits), keep it ≤ 3
PaperReaderBatchSize=5
# Process-level cap on concurrent LLM requests (prevents 429 storms; 3-8 recommended)
# Actual concurrency is governed by this semaphore; BatchSize only controls how much context is shared within a batch
PaperReaderMaxConcurrentLLM=5
# Maximum number of chunks processed in a deep read (keeps costs bounded)
PaperReaderMaxChunks=120