feat(document): add PDF parsing support with PDFBox #463

phodal · phodal · commit 4df394d58184 · 2025-11-24T23:36:54.000+08:00
Introduce PdfDocumentParser using Apache PDFBox for JVM to parse PDF files, extract text, TOC, and page chunks. Register the parser in the document registry and add tests for PDF parsing functionality.
diff --git a/mpp-core/build.gradle.kts b/mpp-core/build.gradle.kts
@@ -185,6 +185,11 @@ kotlin {
                 
                 // Jsoup for HTML document parsing
                 implementation("org.jsoup:jsoup:1.21.2")
+
+                // PDFBox for PDF document parsing
+                implementation("org.apache.pdfbox:pdfbox:3.0.3") {
+                    exclude(group = "commons-logging", module = "commons-logging")
+                }
             }
         }
 
diff --git a/mpp-core/src/jvmMain/kotlin/cc/unitmesh/devins/document/DocumentRegistry.jvm.kt b/mpp-core/src/jvmMain/kotlin/cc/unitmesh/devins/document/DocumentRegistry.jvm.kt
@@ -13,7 +13,6 @@ actual fun platformInitialize() {
     
     // Register Tika parser for multiple formats
     val tikaFormats = listOf(
-        DocumentFormatType.PDF,
         DocumentFormatType.DOCX,
         DocumentFormatType.PLAIN_TEXT
     )
@@ -22,6 +21,10 @@ actual fun platformInitialize() {
         DocumentParserFactory.registerParser(format) { TikaDocumentParser() }
         logger.debug { "Registered TikaDocumentParser for $format" }
     }
+
+    // Register PDFBox parser for PDF
+    DocumentParserFactory.registerParser(DocumentFormatType.PDF) { PdfDocumentParser() }
+    logger.debug { "Registered PdfDocumentParser for PDF" }
     
     // Register Jsoup parser for HTML
     DocumentParserFactory.registerParser(DocumentFormatType.HTML) { JsoupDocumentParser() }
diff --git a/mpp-core/src/jvmMain/kotlin/cc/unitmesh/devins/document/PdfDocumentParser.kt b/mpp-core/src/jvmMain/kotlin/cc/unitmesh/devins/document/PdfDocumentParser.kt
@@ -0,0 +1,155 @@
+package cc.unitmesh.devins.document
+
+import io.github.oshai.kotlinlogging.KotlinLogging
+import org.apache.pdfbox.Loader
+import org.apache.pdfbox.pdmodel.PDDocument
+import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem
+import org.apache.pdfbox.text.PDFTextStripper
+import java.io.File
+
+private val logger = KotlinLogging.logger {}
+
+/**
+ * Apache PDFBox-based document parser for the JVM platform.
+ *
+ * [parse] loads the PDF from disk, extracts its full text, builds one
+ * [DocumentChunk] per non-empty page and converts the document outline into a
+ * [TOCItem] tree. The extracted text and page chunks are cached on the instance
+ * and back the [getDocumentContent], [queryHeading] and [queryChapter] calls, so
+ * an instance only ever reflects its most recent [parse] call.
+ */
+class PdfDocumentParser : DocumentParserService {
+    private var currentContent: String? = null
+    private var currentChunks: List<DocumentChunk> = emptyList()
+
+    /** Returns the trimmed text of the last successfully parsed PDF, or `null` if there is none. */
+    override fun getDocumentContent(): String? = currentContent
+
+    /**
+     * Parses the PDF located at `file.path`.
+     *
+     * @param file descriptor of the PDF; its `path` is opened and its `metadata` is copied
+     * @param content ignored; the PDF bytes are read from disk by PDFBox
+     * @return a copy of [file] carrying the TOC and updated metadata with
+     *   [ParseStatus.PARSED], or a copy marked [ParseStatus.PARSE_FAILED] when the
+     *   file does not exist or extraction throws
+     */
+    override suspend fun parse(file: DocumentFile, content: String): DocumentTreeNode {
+        logger.info { "=== Starting PDFBox Parse ===" }
+        logger.info { "File: ${file.path}" }
+
+        // Drop the previous document's state up front so a failed parse never
+        // leaves stale content or chunks visible through the query methods.
+        currentContent = null
+        currentChunks = emptyList()
+
+        return try {
+            val pdfFile = File(file.path)
+            require(pdfFile.exists()) { "File not found: ${file.path}" }
+
+            Loader.loadPDF(pdfFile).use { document ->
+                // Extract full text
+                val fullText = PDFTextStripper().getText(document)
+                logger.info { "Extracted ${fullText.length} characters" }
+
+                // Build chunks by page
+                val pageChunks = buildPageChunks(document, file.path)
+                logger.info { "Created ${pageChunks.size} document chunks" }
+
+                // Extract TOC
+                val toc = extractTOC(document)
+                logger.info { "Extracted ${toc.size} TOC items" }
+
+                val parsedFile = file.copy(
+                    toc = toc,
+                    metadata = file.metadata.copy(
+                        parseStatus = ParseStatus.PARSED,
+                        chapterCount = toc.size,
+                        totalPages = document.numberOfPages,
+                        mimeType = "application/pdf",
+                        formatType = DocumentFormatType.PDF
+                    )
+                )
+
+                // Publish content and chunks together, only after every step succeeded.
+                currentContent = fullText.trim()
+                currentChunks = pageChunks
+
+                logger.info { "=== Parse Complete ===" }
+                parsedFile
+            }
+        } catch (e: Exception) {
+            // Closing the document can still fail after the state was published.
+            currentContent = null
+            currentChunks = emptyList()
+            // Pass the throwable so the stack trace is logged, not just the message.
+            logger.error(e) { "Failed to parse PDF: ${e.message}" }
+            file.copy(
+                metadata = file.metadata.copy(
+                    parseStatus = ParseStatus.PARSE_FAILED
+                )
+            )
+        }
+    }
+
+    /**
+     * Returns the page chunks whose title or text contains [keyword] (case-insensitive).
+     *
+     * Results are ordered by relevance: exact title match, then partial title
+     * match, then content-only match. An empty [keyword] matches every chunk.
+     */
+    override suspend fun queryHeading(keyword: String): List<DocumentChunk> {
+        return currentChunks.filter {
+            it.chapterTitle?.contains(keyword, ignoreCase = true) == true ||
+                    it.content.contains(keyword, ignoreCase = true)
+        }.sortedByDescending {
+            // Relevance scoring: title match > content match
+            when {
+                it.chapterTitle?.equals(keyword, ignoreCase = true) == true -> 10
+                it.chapterTitle?.contains(keyword, ignoreCase = true) == true -> 5
+                else -> 1
+            }
+        }
+    }
+
+    /**
+     * Finds the chunk whose anchor equals [chapterId], with or without the leading `#`.
+     *
+     * Chunk anchors produced by this parser have the form `#page-N`.
+     */
+    override suspend fun queryChapter(chapterId: String): DocumentChunk? {
+        return currentChunks.find {
+            it.anchor == chapterId || it.anchor == "#$chapterId"
+        }
+    }
+
+    /**
+     * Builds one chunk per page that yields non-blank text.
+     *
+     * A page whose extraction throws is logged and skipped so that a single
+     * broken page does not fail the whole document.
+     */
+    private fun buildPageChunks(document: PDDocument, documentPath: String): List<DocumentChunk> {
+        val chunks = mutableListOf<DocumentChunk>()
+        val stripper = PDFTextStripper()
+
+        // PDFTextStripper page numbers are 1-based and the end page is inclusive.
+        for (pageNumber in 1..document.numberOfPages) {
+            stripper.startPage = pageNumber
+            stripper.endPage = pageNumber
+
+            try {
+                val pageText = stripper.getText(document).trim()
+                if (pageText.isNotEmpty()) {
+                    chunks.add(
+                        DocumentChunk(
+                            documentPath = documentPath,
+                            chapterTitle = "Page $pageNumber",
+                            content = pageText,
+                            anchor = "#page-$pageNumber",
+                            page = pageNumber,
+                            position = PositionMetadata(
+                                documentPath = documentPath,
+                                formatType = DocumentFormatType.PDF,
+                                position = DocumentPosition.PageRange(pageNumber, pageNumber)
+                            )
+                        )
+                    )
+                }
+            } catch (e: Exception) {
+                logger.warn(e) { "Failed to extract text from page $pageNumber: ${e.message}" }
+            }
+        }
+        return chunks
+    }
+
+    /** Converts the PDF outline (bookmarks) into a TOC tree; empty when the PDF has no outline. */
+    private fun extractTOC(document: PDDocument): List<TOCItem> {
+        val outline = document.documentCatalog.documentOutline ?: return emptyList()
+        return siblingsFrom(outline.firstChild)
+            .map { buildTocItem(it, level = 1) }
+            .toList()
+    }
+
+    /** Recursively converts [item] and its descendants into a [TOCItem] at the given [level]. */
+    private fun buildTocItem(item: PDOutlineItem, level: Int): TOCItem {
+        val title = item.title ?: "Untitled"
+        val children = siblingsFrom(item.firstChild)
+            .map { buildTocItem(it, level + 1) }
+            .toList()
+
+        return TOCItem(
+            level = level,
+            title = title,
+            anchor = "#${title.lowercase().replace(NON_SLUG_CHARS, "-")}",
+            children = children
+        )
+    }
+
+    /** Walks the sibling chain that starts at [first]; empty when [first] is `null`. */
+    private fun siblingsFrom(first: PDOutlineItem?): Sequence<PDOutlineItem> =
+        generateSequence(first) { it.nextSibling }
+
+    private companion object {
+        /**
+         * Runs of characters that are not letters, combining marks or digits in any
+         * script. Matching on Unicode categories keeps non-ASCII titles (e.g. CJK)
+         * from collapsing to the single anchor `#-`; ASCII-only titles produce the
+         * same slug as the former `[^a-z0-9]+` pattern.
+         */
+        val NON_SLUG_CHARS = Regex("""[^\p{L}\p{M}\p{N}]+""")
+    }
+}
diff --git a/mpp-core/src/jvmTest/kotlin/cc/unitmesh/devins/document/PdfDocumentParserTest.kt b/mpp-core/src/jvmTest/kotlin/cc/unitmesh/devins/document/PdfDocumentParserTest.kt
@@ -0,0 +1,98 @@
+package cc.unitmesh.devins.document
+
+import kotlinx.coroutines.test.runTest
+import org.junit.Before
+import org.junit.Test
+import java.io.File
+import kotlin.test.assertEquals
+import kotlin.test.assertNotNull
+import kotlin.test.assertTrue
+
+/**
+ * Tests for [PdfDocumentParser]: parsing a bundled sample PDF and the parser's
+ * registration in [DocumentParserFactory].
+ */
+class PdfDocumentParserTest {
+
+    private lateinit var parser: PdfDocumentParser
+
+    @Before
+    fun setup() {
+        parser = PdfDocumentParser()
+        // Initialize platform parsers to ensure factory is set up (though we use parser directly here)
+        DocumentRegistry.initializePlatformParsers()
+    }
+
+    @Test
+    fun `should parse PDF document with content verification`() = runTest {
+        // Given
+        val tempFile = createTempFileFromResource("sample2.pdf")
+        val documentFile = createDocumentFile(tempFile.name, tempFile.absolutePath, tempFile.length(), DocumentFormatType.PDF)
+
+        // When
+        val result = parser.parse(documentFile, "") // Content is ignored by PdfDocumentParser
+
+        // Then
+        assertTrue(result is DocumentFile)
+        assertEquals(ParseStatus.PARSED, result.metadata.parseStatus)
+        assertEquals(DocumentFormatType.PDF, result.metadata.formatType)
+
+        val extractedContent = parser.getDocumentContent()
+        assertNotNull(extractedContent)
+        // Verify content from sample2.pdf
+        assertTrue(extractedContent.contains("Consult doc/pdftex/manual.pdf"), "Should contain specific text from PDF")
+
+        // Check chunks (an empty keyword matches every chunk)
+        val chunks = parser.queryHeading("")
+        assertTrue(chunks.isNotEmpty(), "Should create chunks")
+
+        val firstChunk = chunks.first()
+        assertNotNull(firstChunk.position)
+        assertEquals(tempFile.absolutePath, firstChunk.position?.documentPath)
+        assertTrue(firstChunk.position?.position is DocumentPosition.PageRange, "Position should be PageRange")
+
+        val pageRange = firstChunk.position?.position as DocumentPosition.PageRange
+        assertTrue(pageRange.startPage > 0)
+
+        println("✓ PDF parsed successfully: ${extractedContent.length} chars")
+    }
+
+    @Test
+    fun `should register PdfDocumentParser in factory`() {
+        // When
+        val pdfParser = DocumentParserFactory.createParser(DocumentFormatType.PDF)
+
+        // Then
+        assertNotNull(pdfParser)
+        assertTrue(pdfParser is PdfDocumentParser, "Factory should return PdfDocumentParser for PDF")
+
+        println("✓ DocumentParserFactory integration verified")
+    }
+
+    /**
+     * Copies the classpath resource [fileName] into a temporary `.pdf` file that
+     * is deleted when the JVM exits.
+     *
+     * @throws IllegalArgumentException when the resource is not on the classpath
+     */
+    private fun createTempFileFromResource(fileName: String): File {
+        val resource = javaClass.classLoader.getResourceAsStream(fileName)
+            ?: throw IllegalArgumentException("Resource not found: $fileName")
+
+        // `use` closes the resource stream even if creating or writing the temp file fails.
+        return resource.use { input ->
+            // Create temp file with .pdf extension
+            File.createTempFile("test-", ".pdf").apply {
+                deleteOnExit()
+                outputStream().use { output -> input.copyTo(output) }
+            }
+        }
+    }
+
+    /** Builds a minimal [DocumentFile] whose metadata carries the size, format and current time. */
+    private fun createDocumentFile(
+        name: String,
+        path: String,
+        size: Long,
+        formatType: DocumentFormatType
+    ): DocumentFile {
+        return DocumentFile(
+            name = name,
+            path = path,
+            metadata = DocumentMetadata(
+                lastModified = System.currentTimeMillis(),
+                fileSize = size,
+                formatType = formatType
+            )
+        )
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -185,6 +185,11 @@ kotlin {`
`185`	`185`
`186`	`186`	`// Jsoup for HTML document parsing`
`187`	`187`	`implementation("org.jsoup:jsoup:1.21.2")`
	`188`	`+`
	`189`	`+ // PDFBox for PDF document parsing`
	`190`	`+ implementation("org.apache.pdfbox:pdfbox:3.0.3") {`
	`191`	`+ exclude(group = "commons-logging", module = "commons-logging")`
	`192`	`+ }`
`188`	`193`	`}`
`189`	`194`	`}`
`190`	`195`