-
Notifications
You must be signed in to change notification settings - Fork 843
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Markdown document reader with enhanced features
This commit introduces a new Markdown document reader with several key features and improvements:

* Add support for text with various formatting elements
* Implement handling for horizontal rules and hard line breaks
* Add functionality for inline and block code sections
* Incorporate blockquote handling
* Support ordered and unordered lists
* Introduce additional metadata capabilities
* Include JavaDocs

Update ETL documentation to reflect these new features and usage.

Fixes #105
- Loading branch information
1 parent
a0ee10f
commit 56e678c
Showing
14 changed files
with
1,356 additions
and
96 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven module for the Spring AI Markdown document reader (built on commonmark-java). -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
		 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
		 xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<parent>
		<groupId>org.springframework.ai</groupId>
		<artifactId>spring-ai</artifactId>
		<version>1.0.0-SNAPSHOT</version>
		<relativePath>../../pom.xml</relativePath>
	</parent>
	<artifactId>spring-ai-markdown-document-reader</artifactId>
	<packaging>jar</packaging>
	<name>Spring AI Document Reader - Markdown</name>
	<description>Spring AI Markdown document reader</description>
	<url>https://github.com/spring-projects/spring-ai</url>

	<scm>
		<url>https://github.com/spring-projects/spring-ai</url>
		<connection>git://github.com/spring-projects/spring-ai.git</connection>
		<developerConnection>git@github.com:spring-projects/spring-ai.git</developerConnection>
	</scm>

	<dependencies>
		<dependency>
			<groupId>org.springframework.ai</groupId>
			<artifactId>spring-ai-core</artifactId>
			<!-- ${parent.version} is a deprecated expression; Maven 3 asks for the project.-prefixed form. -->
			<version>${project.parent.version}</version>
		</dependency>

		<!-- Markdown parser; the version property is not defined in this file, so it is expected to come from the parent POM. -->
		<dependency>
			<groupId>org.commonmark</groupId>
			<artifactId>commonmark</artifactId>
			<version>${commonmark.version}</version>
		</dependency>

		<!-- TESTING -->
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-test</artifactId>
			<scope>test</scope>
		</dependency>

	</dependencies>

</project>
207 changes: 207 additions & 0 deletions
207
...n-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
package org.springframework.ai.reader.markdown; | ||
|
||
import org.commonmark.node.*; | ||
import org.commonmark.parser.Parser; | ||
import org.springframework.ai.document.Document; | ||
import org.springframework.ai.document.DocumentReader; | ||
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig; | ||
import org.springframework.core.io.DefaultResourceLoader; | ||
import org.springframework.core.io.Resource; | ||
|
||
import java.io.IOException; | ||
import java.io.InputStreamReader; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/** | ||
* Reads the given Markdown resource and groups headers, paragraphs, or text divided by | ||
* horizontal lines (depending on the | ||
* {@link MarkdownDocumentReaderConfig#horizontalRuleCreateDocument} configuration) into | ||
* {@link Document}s. | ||
* | ||
* @author Piotr Olaszewski | ||
*/ | ||
public class MarkdownDocumentReader implements DocumentReader { | ||
|
||
/** | ||
* The resource points to the Markdown document. | ||
*/ | ||
private final Resource markdownResource; | ||
|
||
/** | ||
* Configuration to a parsing process. | ||
*/ | ||
private final MarkdownDocumentReaderConfig config; | ||
|
||
/** | ||
* Markdown parser. | ||
*/ | ||
private final Parser parser; | ||
|
||
public MarkdownDocumentReader(String markdownResource) { | ||
this(new DefaultResourceLoader().getResource(markdownResource), MarkdownDocumentReaderConfig.defaultConfig()); | ||
} | ||
|
||
public MarkdownDocumentReader(String markdownResource, MarkdownDocumentReaderConfig config) { | ||
this(new DefaultResourceLoader().getResource(markdownResource), config); | ||
} | ||
|
||
public MarkdownDocumentReader(Resource markdownResource, MarkdownDocumentReaderConfig config) { | ||
this.markdownResource = markdownResource; | ||
this.config = config; | ||
this.parser = Parser.builder().build(); | ||
} | ||
|
||
/** | ||
* Extracts and returns a list of documents from the resource. | ||
* @return List of extracted {@link Document} | ||
*/ | ||
@Override | ||
public List<Document> get() { | ||
try (var input = markdownResource.getInputStream()) { | ||
Node node = parser.parseReader(new InputStreamReader(input)); | ||
|
||
DocumentVisitor documentVisitor = new DocumentVisitor(config); | ||
node.accept(documentVisitor); | ||
|
||
return documentVisitor.getDocuments(); | ||
} | ||
catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
/** | ||
* A convenient class for visiting handled nodes in the Markdown document. | ||
*/ | ||
static class DocumentVisitor extends AbstractVisitor { | ||
|
||
private final List<Document> documents = new ArrayList<>(); | ||
|
||
private final List<String> currentParagraphs = new ArrayList<>(); | ||
|
||
private final MarkdownDocumentReaderConfig config; | ||
|
||
private Document.Builder currentDocumentBuilder; | ||
|
||
public DocumentVisitor(MarkdownDocumentReaderConfig config) { | ||
this.config = config; | ||
} | ||
|
||
@Override | ||
public void visit(org.commonmark.node.Document document) { | ||
currentDocumentBuilder = Document.builder(); | ||
super.visit(document); | ||
} | ||
|
||
@Override | ||
public void visit(Heading heading) { | ||
buildAndFlush(); | ||
super.visit(heading); | ||
} | ||
|
||
@Override | ||
public void visit(ThematicBreak thematicBreak) { | ||
if (config.horizontalRuleCreateDocument) { | ||
buildAndFlush(); | ||
} | ||
super.visit(thematicBreak); | ||
} | ||
|
||
@Override | ||
public void visit(SoftLineBreak softLineBreak) { | ||
translateLineBreakToSpace(); | ||
super.visit(softLineBreak); | ||
} | ||
|
||
@Override | ||
public void visit(HardLineBreak hardLineBreak) { | ||
translateLineBreakToSpace(); | ||
super.visit(hardLineBreak); | ||
} | ||
|
||
@Override | ||
public void visit(ListItem listItem) { | ||
translateLineBreakToSpace(); | ||
super.visit(listItem); | ||
} | ||
|
||
@Override | ||
public void visit(BlockQuote blockQuote) { | ||
if (!config.includeBlockquote) { | ||
buildAndFlush(); | ||
} | ||
|
||
translateLineBreakToSpace(); | ||
currentDocumentBuilder.withMetadata("category", "blockquote"); | ||
super.visit(blockQuote); | ||
} | ||
|
||
@Override | ||
public void visit(Code code) { | ||
currentParagraphs.add(code.getLiteral()); | ||
currentDocumentBuilder.withMetadata("category", "code_inline"); | ||
super.visit(code); | ||
} | ||
|
||
@Override | ||
public void visit(FencedCodeBlock fencedCodeBlock) { | ||
if (!config.includeCodeBlock) { | ||
buildAndFlush(); | ||
} | ||
|
||
translateLineBreakToSpace(); | ||
currentParagraphs.add(fencedCodeBlock.getLiteral()); | ||
currentDocumentBuilder.withMetadata("category", "code_block"); | ||
currentDocumentBuilder.withMetadata("lang", fencedCodeBlock.getInfo()); | ||
|
||
buildAndFlush(); | ||
|
||
super.visit(fencedCodeBlock); | ||
} | ||
|
||
@Override | ||
public void visit(Text text) { | ||
if (text.getParent() instanceof Heading heading) { | ||
currentDocumentBuilder.withMetadata("category", "header_%d".formatted(heading.getLevel())) | ||
.withMetadata("title", text.getLiteral()); | ||
} | ||
else { | ||
currentParagraphs.add(text.getLiteral()); | ||
} | ||
|
||
super.visit(text); | ||
} | ||
|
||
public List<Document> getDocuments() { | ||
buildAndFlush(); | ||
|
||
return documents; | ||
} | ||
|
||
private void buildAndFlush() { | ||
if (!currentParagraphs.isEmpty()) { | ||
String content = String.join("", currentParagraphs); | ||
|
||
Document.Builder builder = currentDocumentBuilder.withContent(content); | ||
|
||
config.additionalMetadata.forEach(builder::withMetadata); | ||
|
||
Document document = builder.build(); | ||
|
||
documents.add(document); | ||
|
||
currentParagraphs.clear(); | ||
} | ||
currentDocumentBuilder = Document.builder(); | ||
} | ||
|
||
private void translateLineBreakToSpace() { | ||
if (!currentParagraphs.isEmpty()) { | ||
currentParagraphs.add(" "); | ||
} | ||
} | ||
|
||
} | ||
|
||
} |
123 changes: 123 additions & 0 deletions
123
...main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
package org.springframework.ai.reader.markdown.config; | ||
|
||
import org.springframework.ai.document.Document; | ||
import org.springframework.ai.reader.markdown.MarkdownDocumentReader; | ||
import org.springframework.util.Assert; | ||
|
||
import java.util.HashMap; | ||
import java.util.Map; | ||
|
||
/** | ||
* Common configuration for the {@link MarkdownDocumentReader}. | ||
* | ||
* @author Piotr Olaszewski | ||
*/ | ||
public class MarkdownDocumentReaderConfig { | ||
|
||
public final boolean horizontalRuleCreateDocument; | ||
|
||
public final boolean includeCodeBlock; | ||
|
||
public final boolean includeBlockquote; | ||
|
||
public final Map<String, Object> additionalMetadata; | ||
|
||
public MarkdownDocumentReaderConfig(Builder builder) { | ||
horizontalRuleCreateDocument = builder.horizontalRuleCreateDocument; | ||
includeCodeBlock = builder.includeCodeBlock; | ||
includeBlockquote = builder.includeBlockquote; | ||
additionalMetadata = builder.additionalMetadata; | ||
} | ||
|
||
/** | ||
* @return the default configuration | ||
*/ | ||
public static MarkdownDocumentReaderConfig defaultConfig() { | ||
return builder().build(); | ||
} | ||
|
||
public static Builder builder() { | ||
return new Builder(); | ||
} | ||
|
||
public static class Builder { | ||
|
||
private boolean horizontalRuleCreateDocument = false; | ||
|
||
private boolean includeCodeBlock = false; | ||
|
||
private boolean includeBlockquote = false; | ||
|
||
private Map<String, Object> additionalMetadata = new HashMap<>(); | ||
|
||
private Builder() { | ||
} | ||
|
||
/** | ||
* Text divided by horizontal lines will create new {@link Document}s. The default | ||
* is {@code false}, meaning text separated by horizontal lines won't create a new | ||
* document. | ||
* @param horizontalRuleCreateDocument flag to determine whether new documents are | ||
* created from text divided by horizontal line | ||
* @return this builder | ||
*/ | ||
public Builder withHorizontalRuleCreateDocument(boolean horizontalRuleCreateDocument) { | ||
this.horizontalRuleCreateDocument = horizontalRuleCreateDocument; | ||
return this; | ||
} | ||
|
||
/** | ||
* Whatever to include code blocks in {@link Document}s. The default is | ||
* {@code false}, which means all code blocks are in separate documents. | ||
* @param includeCodeBlock flag to include code block into paragraph document or | ||
* create new with code only | ||
* @return this builder | ||
*/ | ||
public Builder withIncludeCodeBlock(boolean includeCodeBlock) { | ||
this.includeCodeBlock = includeCodeBlock; | ||
return this; | ||
} | ||
|
||
/** | ||
* Whatever to include blockquotes in {@link Document}s. The default is | ||
* {@code false}, which means all blockquotes are in separate documents. | ||
* @param includeBlockquote flag to include blockquotes into paragraph document or | ||
* create new with blockquote only | ||
* @return this builder | ||
*/ | ||
public Builder withIncludeBlockquote(boolean includeBlockquote) { | ||
this.includeBlockquote = includeBlockquote; | ||
return this; | ||
} | ||
|
||
/** | ||
* Adds this additional metadata to the all built {@link Document}s. | ||
* @return this builder | ||
*/ | ||
public Builder withAdditionalMetadata(String key, Object value) { | ||
Assert.notNull(key, "key must not be null"); | ||
Assert.notNull(value, "value must not be null"); | ||
this.additionalMetadata.put(key, value); | ||
return this; | ||
} | ||
|
||
/** | ||
* Adds this additional metadata to the all built {@link Document}s. | ||
* @return this builder | ||
*/ | ||
public Builder withAdditionalMetadata(Map<String, Object> additionalMetadata) { | ||
Assert.notNull(additionalMetadata, "additionalMetadata must not be null"); | ||
this.additionalMetadata = additionalMetadata; | ||
return this; | ||
} | ||
|
||
/** | ||
* @return the immutable configuration | ||
*/ | ||
public MarkdownDocumentReaderConfig build() { | ||
return new MarkdownDocumentReaderConfig(this); | ||
} | ||
|
||
} | ||
|
||
} |
Oops, something went wrong.