Skip to content

Commit 4df394d

Browse files
committed
feat(document): add PDF parsing support with PDFBox #463
Introduce PdfDocumentParser using Apache PDFBox for JVM to parse PDF files, extract text, TOC, and page chunks. Register the parser in the document registry and add tests for PDF parsing functionality.
1 parent 0e67981 commit 4df394d

File tree

4 files changed

+262
-1
lines changed

4 files changed

+262
-1
lines changed

mpp-core/build.gradle.kts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,11 @@ kotlin {
185185

186186
// Jsoup for HTML document parsing
187187
implementation("org.jsoup:jsoup:1.21.2")
188+
189+
// PDFBox for PDF document parsing
190+
implementation("org.apache.pdfbox:pdfbox:3.0.3") {
191+
exclude(group = "commons-logging", module = "commons-logging")
192+
}
188193
}
189194
}
190195

mpp-core/src/jvmMain/kotlin/cc/unitmesh/devins/document/DocumentRegistry.jvm.kt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ actual fun platformInitialize() {
1313

1414
// Register Tika parser for multiple formats
1515
val tikaFormats = listOf(
16-
DocumentFormatType.PDF,
1716
DocumentFormatType.DOCX,
1817
DocumentFormatType.PLAIN_TEXT
1918
)
@@ -22,6 +21,10 @@ actual fun platformInitialize() {
2221
DocumentParserFactory.registerParser(format) { TikaDocumentParser() }
2322
logger.debug { "Registered TikaDocumentParser for $format" }
2423
}
24+
25+
// Register PDFBox parser for PDF
26+
DocumentParserFactory.registerParser(DocumentFormatType.PDF) { PdfDocumentParser() }
27+
logger.debug { "Registered PdfDocumentParser for PDF" }
2528

2629
// Register Jsoup parser for HTML
2730
DocumentParserFactory.registerParser(DocumentFormatType.HTML) { JsoupDocumentParser() }
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
package cc.unitmesh.devins.document
2+
3+
import io.github.oshai.kotlinlogging.KotlinLogging
4+
import org.apache.pdfbox.Loader
5+
import org.apache.pdfbox.pdmodel.PDDocument
6+
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem
7+
import org.apache.pdfbox.text.PDFTextStripper
8+
import java.io.File
9+
10+
// File-level logger used by PdfDocumentParser for parse progress and failure messages.
private val logger = KotlinLogging.logger {}
11+
12+
/**
 * Apache PDFBox-based [DocumentParserService] for the JVM platform.
 *
 * [parse] loads the PDF found at `file.path` and extracts:
 *  - the full document text (exposed through [getDocumentContent]),
 *  - one [DocumentChunk] per page that contains text (searched by [queryHeading] / [queryChapter]),
 *  - the document outline (bookmarks) as a tree of [TOCItem]s.
 *
 * The text and chunks of the most recent successful parse are kept in plain, unsynchronized
 * properties on the instance; a failed parse clears them.
 *
 * NOTE(review): [parse] performs blocking file I/O on the caller's dispatcher — confirm callers
 * invoke it from a context where blocking is acceptable.
 */
class PdfDocumentParser : DocumentParserService {
    private var currentContent: String? = null
    private var currentChunks: List<DocumentChunk> = emptyList()

    /** Returns the trimmed full text of the last successfully parsed PDF, or `null` if there is none. */
    override fun getDocumentContent(): String? = currentContent

    /**
     * Parses the PDF located at `file.path`.
     *
     * @param file descriptor of the document; only its `path` is used to locate the PDF on disk.
     * @param content ignored — the PDF is always read from disk.
     * @return a copy of [file] with TOC and metadata filled in and status [ParseStatus.PARSED], or a
     * copy with status [ParseStatus.PARSE_FAILED] when the file is missing or cannot be processed.
     */
    override suspend fun parse(file: DocumentFile, content: String): DocumentTreeNode {
        logger.info { "=== Starting PDFBox Parse ===" }
        logger.info { "File: ${file.path}" }

        // Drop the previous document's state up front so that a failed parse can never leave
        // stale text/chunks behind for later queries.
        currentContent = null
        currentChunks = emptyList()

        return try {
            val pdfFile = File(file.path)
            require(pdfFile.exists()) { "File not found: ${file.path}" }

            Loader.loadPDF(pdfFile).use { document ->
                // Extract full text
                val fullText = PDFTextStripper().getText(document)
                logger.info { "Extracted ${fullText.length} characters" }

                // Build chunks by page
                val chunks = buildPageChunks(document, file.path)
                logger.info { "Created ${chunks.size} document chunks" }

                // Extract TOC
                val toc = extractTOC(document)
                logger.info { "Extracted ${toc.size} TOC items" }

                // Publish the new state only after every extraction step succeeded, keeping
                // content and chunks consistent with each other.
                currentContent = fullText.trim()
                currentChunks = chunks

                logger.info { "=== Parse Complete ===" }

                file.copy(
                    toc = toc,
                    metadata = file.metadata.copy(
                        parseStatus = ParseStatus.PARSED,
                        chapterCount = toc.size,
                        totalPages = document.numberOfPages,
                        mimeType = "application/pdf",
                        formatType = DocumentFormatType.PDF
                    )
                )
            }
        } catch (e: Exception) {
            // Pass the throwable so the stack trace is logged, not just the message.
            logger.error(e) { "Failed to parse PDF: ${e.message}" }
            file.copy(
                metadata = file.metadata.copy(
                    parseStatus = ParseStatus.PARSE_FAILED
                )
            )
        }
    }

    /**
     * Returns the chunks whose title or content contains [keyword] (case-insensitive), ordered by
     * relevance: exact title match first, then partial title match, then content-only match.
     * An empty keyword matches every chunk.
     */
    override suspend fun queryHeading(keyword: String): List<DocumentChunk> {
        return currentChunks.filter {
            it.chapterTitle?.contains(keyword, ignoreCase = true) == true ||
                it.content.contains(keyword, ignoreCase = true)
        }.sortedByDescending {
            // Relevance scoring: title match > content match
            when {
                it.chapterTitle?.equals(keyword, ignoreCase = true) == true -> 10
                it.chapterTitle?.contains(keyword, ignoreCase = true) == true -> 5
                else -> 1
            }
        }
    }

    /**
     * Finds the chunk whose anchor equals [chapterId], with or without a leading `#`.
     * Chunk anchors have the form `#page-N`.
     */
    override suspend fun queryChapter(chapterId: String): DocumentChunk? {
        return currentChunks.find {
            it.anchor == chapterId || it.anchor == "#$chapterId"
        }
    }

    /**
     * Extracts one chunk per page of [document]. Pages without text are skipped, and a page whose
     * extraction throws is logged and skipped so the remaining pages are still returned.
     */
    private fun buildPageChunks(document: PDDocument, documentPath: String): List<DocumentChunk> {
        val chunks = mutableListOf<DocumentChunk>()
        val stripper = PDFTextStripper()

        // PDFTextStripper page bounds are 1-based and inclusive.
        for (pageNumber in 1..document.numberOfPages) {
            stripper.startPage = pageNumber
            stripper.endPage = pageNumber

            try {
                val pageText = stripper.getText(document).trim()
                if (pageText.isNotEmpty()) {
                    chunks.add(
                        DocumentChunk(
                            documentPath = documentPath,
                            chapterTitle = "Page $pageNumber",
                            content = pageText,
                            anchor = "#page-$pageNumber",
                            page = pageNumber,
                            position = PositionMetadata(
                                documentPath = documentPath,
                                formatType = DocumentFormatType.PDF,
                                position = DocumentPosition.PageRange(pageNumber, pageNumber)
                            )
                        )
                    )
                }
            } catch (e: Exception) {
                logger.warn(e) { "Failed to extract text from page $pageNumber: ${e.message}" }
            }
        }
        return chunks
    }

    /**
     * Converts the PDF outline (bookmarks) of [document] into a TOC tree.
     * Returns an empty list when the document has no outline.
     */
    private fun extractTOC(document: PDDocument): List<TOCItem> {
        val outline = document.documentCatalog.documentOutline ?: return emptyList()
        // Identity-based set of outline dictionaries already visited; guards against malformed
        // PDFs whose sibling/child links form a cycle.
        val visited: MutableSet<Any> =
            java.util.Collections.newSetFromMap(java.util.IdentityHashMap<Any, Boolean>())
        return collectOutlineItems(outline.firstChild, 1, visited)
    }

    /**
     * Walks the sibling chain starting at [first], recursing into each item's children, and
     * returns the resulting [TOCItem]s at the given [level]. The walk stops as soon as it reaches
     * an outline dictionary that is already in [visited].
     */
    private fun collectOutlineItems(
        first: PDOutlineItem?,
        level: Int,
        visited: MutableSet<Any>
    ): List<TOCItem> {
        val items = mutableListOf<TOCItem>()
        var current = first
        // `visited.add` returns false for a dictionary seen before, which ends the walk on cycles.
        while (current != null && visited.add(current.cosObject)) {
            val title = current.title ?: "Untitled"
            items.add(
                TOCItem(
                    level = level,
                    title = title,
                    anchor = "#${title.lowercase().replace(NON_ANCHOR_CHARS, "-")}",
                    children = collectOutlineItems(current.firstChild, level + 1, visited)
                )
            )
            current = current.nextSibling
        }
        return items
    }

    private companion object {
        /** Runs of characters that are not lowercase ASCII letters or digits; replaced by `-` in TOC anchors. */
        val NON_ANCHOR_CHARS = Regex("[^a-z0-9]+")
    }
}
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
package cc.unitmesh.devins.document
2+
3+
import kotlinx.coroutines.test.runTest
4+
import org.junit.Before
5+
import org.junit.Test
6+
import java.io.File
7+
import kotlin.test.assertEquals
8+
import kotlin.test.assertNotNull
9+
import kotlin.test.assertTrue
10+
11+
/**
 * Tests for [PdfDocumentParser]: parsing of the bundled `sample2.pdf` test resource and
 * registration of the parser in [DocumentParserFactory].
 */
class PdfDocumentParserTest {

    private lateinit var parser: PdfDocumentParser

    @Before
    fun setup() {
        parser = PdfDocumentParser()
        // Initialize platform parsers to ensure factory is set up (though we use parser directly here)
        DocumentRegistry.initializePlatformParsers()
    }

    @Test
    fun `should parse PDF document with content verification`() = runTest {
        // Given
        val tempFile = createTempFileFromResource("sample2.pdf")
        val documentFile = createDocumentFile(tempFile.name, tempFile.absolutePath, tempFile.length(), DocumentFormatType.PDF)

        // When
        val result = parser.parse(documentFile, "") // Content is ignored by PdfDocumentParser

        // Then
        assertTrue(result is DocumentFile)
        assertEquals(ParseStatus.PARSED, result.metadata.parseStatus)
        assertEquals(DocumentFormatType.PDF, result.metadata.formatType)

        val extractedContent = parser.getDocumentContent()
        assertNotNull(extractedContent)
        // Verify content from sample2.pdf
        assertTrue(extractedContent.contains("Consult doc/pdftex/manual.pdf"), "Should contain specific text from PDF")

        // Check chunks (an empty keyword matches every chunk)
        val chunks = parser.queryHeading("")
        assertTrue(chunks.isNotEmpty(), "Should create chunks")

        val firstChunk = chunks.first()
        assertNotNull(firstChunk.position)
        assertEquals(tempFile.absolutePath, firstChunk.position?.documentPath)
        assertTrue(firstChunk.position?.position is DocumentPosition.PageRange, "Position should be PageRange")

        val pageRange = firstChunk.position?.position as DocumentPosition.PageRange
        assertTrue(pageRange.startPage > 0)

        println("✓ PDF parsed successfully: ${extractedContent.length} chars")
    }

    @Test
    fun `should register PdfDocumentParser in factory`() {
        // When
        val pdfParser = DocumentParserFactory.createParser(DocumentFormatType.PDF)

        // Then
        assertNotNull(pdfParser)
        assertTrue(pdfParser is PdfDocumentParser, "Factory should return PdfDocumentParser for PDF")

        println("✓ DocumentParserFactory integration verified")
    }

    /**
     * Copies the classpath resource [fileName] into a fresh temporary `.pdf` file that is deleted
     * on JVM exit, and returns that file.
     *
     * @throws IllegalArgumentException if the resource is not on the test classpath.
     */
    private fun createTempFileFromResource(fileName: String): File {
        val resource = javaClass.classLoader.getResourceAsStream(fileName)
            ?: throw IllegalArgumentException("Resource not found: $fileName")

        // `use` closes the resource stream even when creating or writing the temp file fails.
        return resource.use { input ->
            // Create temp file with .pdf extension
            File.createTempFile("test-", ".pdf").apply {
                deleteOnExit()
                outputStream().use { output -> input.copyTo(output) }
            }
        }
    }

    /** Builds a minimal [DocumentFile] descriptor whose `lastModified` is the current time. */
    private fun createDocumentFile(
        name: String,
        path: String,
        size: Long,
        formatType: DocumentFormatType
    ): DocumentFile {
        return DocumentFile(
            name = name,
            path = path,
            metadata = DocumentMetadata(
                lastModified = System.currentTimeMillis(),
                fileSize = size,
                formatType = formatType
            )
        )
    }
}

0 commit comments

Comments
 (0)