Skip to content

Commit

Permalink
Add Markdown document reader with enhanced features
Browse files Browse the repository at this point in the history
This commit introduces a new Markdown document reader with several
key features and improvements:

* Add support for text with various formatting elements
* Implement handling for horizontal rules and hard line breaks
* Add functionality for inline and block code sections
* Incorporate blockquote handling
* Support ordered and unordered lists
* Introduce additional metadata capabilities
* Include JavaDocs

Update ETL documentation to reflect these new features and usage.

Fixes #105
  • Loading branch information
piotrooo authored and markpollack committed Aug 22, 2024
1 parent a0ee10f commit 56e678c
Show file tree
Hide file tree
Showing 14 changed files with 1,356 additions and 96 deletions.
46 changes: 46 additions & 0 deletions document-readers/markdown-reader/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
	Maven module descriptor for the Spring AI Markdown document reader.
	Inherits its version and build settings from the spring-ai parent POM two directories up.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
		 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
		 xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<parent>
		<groupId>org.springframework.ai</groupId>
		<artifactId>spring-ai</artifactId>
		<version>1.0.0-SNAPSHOT</version>
		<relativePath>../../pom.xml</relativePath>
	</parent>
	<artifactId>spring-ai-markdown-document-reader</artifactId>
	<packaging>jar</packaging>
	<name>Spring AI Document Reader - Markdown</name>
	<description>Spring AI Markdown document reader</description>
	<url>https://github.com/spring-projects/spring-ai</url>

	<scm>
		<url>https://github.com/spring-projects/spring-ai</url>
		<connection>git://github.com/spring-projects/spring-ai.git</connection>
		<developerConnection>git@github.com:spring-projects/spring-ai.git</developerConnection>
	</scm>

	<dependencies>
		<dependency>
			<groupId>org.springframework.ai</groupId>
			<artifactId>spring-ai-core</artifactId>
			<!-- ${parent.version} is a deprecated expression; Maven asks for the project.-prefixed form. -->
			<version>${project.parent.version}</version>
		</dependency>

		<!-- Markdown parser; the commonmark.version property is not defined in this POM. -->
		<dependency>
			<groupId>org.commonmark</groupId>
			<artifactId>commonmark</artifactId>
			<version>${commonmark.version}</version>
		</dependency>

		<!-- TESTING -->
		<dependency>
			<groupId>org.springframework.boot</groupId>
			<artifactId>spring-boot-starter-test</artifactId>
			<scope>test</scope>
		</dependency>

	</dependencies>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
package org.springframework.ai.reader.markdown;

import org.commonmark.node.*;
import org.commonmark.parser.Parser;
import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;

import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * Reads the given Markdown resource and groups headers, paragraphs, or text divided by
 * horizontal lines (depending on the
 * {@link MarkdownDocumentReaderConfig#horizontalRuleCreateDocument} configuration) into
 * {@link Document}s.
 *
 * <p>
 * The Markdown source is always decoded as UTF-8, independent of the platform default
 * charset.
 *
 * @author Piotr Olaszewski
 */
public class MarkdownDocumentReader implements DocumentReader {

	/**
	 * The resource points to the Markdown document.
	 */
	private final Resource markdownResource;

	/**
	 * Configuration to a parsing process.
	 */
	private final MarkdownDocumentReaderConfig config;

	/**
	 * Markdown parser.
	 */
	private final Parser parser;

	/**
	 * Creates a reader for the resource at the given location using
	 * {@link MarkdownDocumentReaderConfig#defaultConfig()}.
	 * @param markdownResource location of the Markdown document, resolved through a
	 * {@link DefaultResourceLoader}
	 */
	public MarkdownDocumentReader(String markdownResource) {
		this(new DefaultResourceLoader().getResource(markdownResource), MarkdownDocumentReaderConfig.defaultConfig());
	}

	/**
	 * Creates a reader for the resource at the given location.
	 * @param markdownResource location of the Markdown document, resolved through a
	 * {@link DefaultResourceLoader}
	 * @param config configuration controlling how the Markdown is split into documents
	 */
	public MarkdownDocumentReader(String markdownResource, MarkdownDocumentReaderConfig config) {
		this(new DefaultResourceLoader().getResource(markdownResource), config);
	}

	/**
	 * Creates a reader for the given resource.
	 * @param markdownResource the Markdown document to read
	 * @param config configuration controlling how the Markdown is split into documents
	 */
	public MarkdownDocumentReader(Resource markdownResource, MarkdownDocumentReaderConfig config) {
		this.markdownResource = markdownResource;
		this.config = config;
		this.parser = Parser.builder().build();
	}

	/**
	 * Extracts and returns a list of documents from the resource.
	 * @return List of extracted {@link Document}
	 * @throws RuntimeException wrapping the {@link IOException} raised while opening or
	 * reading the resource
	 */
	@Override
	public List<Document> get() {
		// Decode explicitly as UTF-8: the single-argument InputStreamReader constructor
		// falls back to the platform default charset, which varies between machines.
		try (var input = markdownResource.getInputStream();
				var reader = new InputStreamReader(input, StandardCharsets.UTF_8)) {
			Node node = parser.parseReader(reader);

			DocumentVisitor documentVisitor = new DocumentVisitor(config);
			node.accept(documentVisitor);

			return documentVisitor.getDocuments();
		}
		catch (IOException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * A convenient class for visiting handled nodes in the Markdown document.
	 *
	 * <p>
	 * Text fragments are accumulated in {@code currentParagraphs} and metadata in
	 * {@code currentDocumentBuilder}; {@link #buildAndFlush()} turns the accumulated
	 * state into a {@link Document} and starts a fresh one.
	 */
	static class DocumentVisitor extends AbstractVisitor {

		/** Documents completed so far. */
		private final List<Document> documents = new ArrayList<>();

		/** Text fragments collected for the document currently being assembled. */
		private final List<String> currentParagraphs = new ArrayList<>();

		private final MarkdownDocumentReaderConfig config;

		/** Builder carrying the metadata of the document currently being assembled. */
		private Document.Builder currentDocumentBuilder;

		public DocumentVisitor(MarkdownDocumentReaderConfig config) {
			this.config = config;
		}

		/**
		 * Starts a fresh document builder at the root of the Markdown tree, then visits
		 * the children.
		 */
		@Override
		public void visit(org.commonmark.node.Document document) {
			currentDocumentBuilder = Document.builder();
			super.visit(document);
		}

		/**
		 * A heading closes the document being assembled; the heading's own text is
		 * handled in {@link #visit(Text)}.
		 */
		@Override
		public void visit(Heading heading) {
			buildAndFlush();
			super.visit(heading);
		}

		/**
		 * A horizontal rule closes the document being assembled only when
		 * {@code horizontalRuleCreateDocument} is enabled.
		 */
		@Override
		public void visit(ThematicBreak thematicBreak) {
			if (config.horizontalRuleCreateDocument) {
				buildAndFlush();
			}
			super.visit(thematicBreak);
		}

		/** Soft line breaks become a single space between text fragments. */
		@Override
		public void visit(SoftLineBreak softLineBreak) {
			translateLineBreakToSpace();
			super.visit(softLineBreak);
		}

		/** Hard line breaks become a single space between text fragments. */
		@Override
		public void visit(HardLineBreak hardLineBreak) {
			translateLineBreakToSpace();
			super.visit(hardLineBreak);
		}

		/** Separates a list item from any preceding text with a single space. */
		@Override
		public void visit(ListItem listItem) {
			translateLineBreakToSpace();
			super.visit(listItem);
		}

		/**
		 * Tags the current document as a blockquote. Unless {@code includeBlockquote}
		 * is enabled, the text collected before the quote is flushed into its own
		 * document first.
		 */
		@Override
		public void visit(BlockQuote blockQuote) {
			if (!config.includeBlockquote) {
				buildAndFlush();
			}

			translateLineBreakToSpace();
			currentDocumentBuilder.withMetadata("category", "blockquote");
			super.visit(blockQuote);
		}

		/** Appends inline code to the current text and tags the document accordingly. */
		@Override
		public void visit(Code code) {
			currentParagraphs.add(code.getLiteral());
			currentDocumentBuilder.withMetadata("category", "code_inline");
			super.visit(code);
		}

		/**
		 * Adds a fenced code block and records its info string as {@code lang}
		 * metadata. Unless {@code includeCodeBlock} is enabled, the text collected
		 * before the block is flushed into its own document first. The document is
		 * always flushed after the code block has been added.
		 */
		@Override
		public void visit(FencedCodeBlock fencedCodeBlock) {
			if (!config.includeCodeBlock) {
				buildAndFlush();
			}

			translateLineBreakToSpace();
			currentParagraphs.add(fencedCodeBlock.getLiteral());
			currentDocumentBuilder.withMetadata("category", "code_block");
			currentDocumentBuilder.withMetadata("lang", fencedCodeBlock.getInfo());

			buildAndFlush();

			super.visit(fencedCodeBlock);
		}

		/**
		 * Text whose direct parent is a heading becomes {@code category} /
		 * {@code title} metadata; any other text is appended to the current content.
		 */
		@Override
		public void visit(Text text) {
			if (text.getParent() instanceof Heading heading) {
				currentDocumentBuilder.withMetadata("category", "header_%d".formatted(heading.getLevel()))
					.withMetadata("title", text.getLiteral());
			}
			else {
				currentParagraphs.add(text.getLiteral());
			}

			super.visit(text);
		}

		/**
		 * Flushes any pending text and returns all documents collected so far.
		 * @return the collected documents
		 */
		public List<Document> getDocuments() {
			buildAndFlush();

			return documents;
		}

		/**
		 * Builds a document from the pending text (if any), attaches the configured
		 * additional metadata, and resets the builder. When no text is pending, the
		 * metadata gathered on the current builder is discarded with it.
		 */
		private void buildAndFlush() {
			if (!currentParagraphs.isEmpty()) {
				String content = String.join("", currentParagraphs);

				Document.Builder builder = currentDocumentBuilder.withContent(content);

				config.additionalMetadata.forEach(builder::withMetadata);

				Document document = builder.build();

				documents.add(document);

				currentParagraphs.clear();
			}
			currentDocumentBuilder = Document.builder();
		}

		/**
		 * Appends a separating space, but only when there is already pending text so a
		 * document never starts with whitespace.
		 */
		private void translateLineBreakToSpace() {
			if (!currentParagraphs.isEmpty()) {
				currentParagraphs.add(" ");
			}
		}

	}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
package org.springframework.ai.reader.markdown.config;

import org.springframework.ai.document.Document;
import org.springframework.ai.reader.markdown.MarkdownDocumentReader;
import org.springframework.util.Assert;

import java.util.HashMap;
import java.util.Map;

/**
 * Common configuration for the {@link MarkdownDocumentReader}.
 *
 * <p>
 * Instances are created through {@link #builder()}; each built configuration holds its
 * own copy of the additional metadata, so reusing the builder afterwards does not affect
 * it.
 *
 * @author Piotr Olaszewski
 */
public class MarkdownDocumentReaderConfig {

	/** Whether text divided by horizontal rules is split into separate documents. */
	public final boolean horizontalRuleCreateDocument;

	/** Whether code blocks stay in the surrounding document instead of a separate one. */
	public final boolean includeCodeBlock;

	/** Whether blockquotes stay in the surrounding document instead of a separate one. */
	public final boolean includeBlockquote;

	/** Metadata entries added to every built {@link Document}. */
	public final Map<String, Object> additionalMetadata;

	/**
	 * Creates a configuration from the builder's current settings.
	 * @param builder the builder to read the settings from
	 */
	public MarkdownDocumentReaderConfig(Builder builder) {
		horizontalRuleCreateDocument = builder.horizontalRuleCreateDocument;
		includeCodeBlock = builder.includeCodeBlock;
		includeBlockquote = builder.includeBlockquote;
		// Snapshot the map: sharing the builder's instance would let later builder calls
		// silently change a configuration that has already been built.
		additionalMetadata = new HashMap<>(builder.additionalMetadata);
	}

	/**
	 * @return the default configuration
	 */
	public static MarkdownDocumentReaderConfig defaultConfig() {
		return builder().build();
	}

	/**
	 * @return a new builder with all flags {@code false} and no additional metadata
	 */
	public static Builder builder() {
		return new Builder();
	}

	public static class Builder {

		private boolean horizontalRuleCreateDocument = false;

		private boolean includeCodeBlock = false;

		private boolean includeBlockquote = false;

		private Map<String, Object> additionalMetadata = new HashMap<>();

		private Builder() {
		}

		/**
		 * Text divided by horizontal lines will create new {@link Document}s. The default
		 * is {@code false}, meaning text separated by horizontal lines won't create a new
		 * document.
		 * @param horizontalRuleCreateDocument flag to determine whether new documents are
		 * created from text divided by horizontal line
		 * @return this builder
		 */
		public Builder withHorizontalRuleCreateDocument(boolean horizontalRuleCreateDocument) {
			this.horizontalRuleCreateDocument = horizontalRuleCreateDocument;
			return this;
		}

		/**
		 * Whether to include code blocks in {@link Document}s. The default is
		 * {@code false}, which means all code blocks are in separate documents.
		 * @param includeCodeBlock flag to include code block into paragraph document or
		 * create new with code only
		 * @return this builder
		 */
		public Builder withIncludeCodeBlock(boolean includeCodeBlock) {
			this.includeCodeBlock = includeCodeBlock;
			return this;
		}

		/**
		 * Whether to include blockquotes in {@link Document}s. The default is
		 * {@code false}, which means all blockquotes are in separate documents.
		 * @param includeBlockquote flag to include blockquotes into paragraph document or
		 * create new with blockquote only
		 * @return this builder
		 */
		public Builder withIncludeBlockquote(boolean includeBlockquote) {
			this.includeBlockquote = includeBlockquote;
			return this;
		}

		/**
		 * Adds this additional metadata entry to all built {@link Document}s.
		 * @param key the metadata key, must not be {@code null}
		 * @param value the metadata value, must not be {@code null}
		 * @return this builder
		 */
		public Builder withAdditionalMetadata(String key, Object value) {
			Assert.notNull(key, "key must not be null");
			Assert.notNull(value, "value must not be null");
			this.additionalMetadata.put(key, value);
			return this;
		}

		/**
		 * Replaces the additional metadata added to all built {@link Document}s with a
		 * copy of the given map.
		 * @param additionalMetadata the metadata entries, must not be {@code null}
		 * @return this builder
		 */
		public Builder withAdditionalMetadata(Map<String, Object> additionalMetadata) {
			Assert.notNull(additionalMetadata, "additionalMetadata must not be null");
			// Copy instead of aliasing: the caller's map may be unmodifiable (Map.of(...)),
			// which would make a later withAdditionalMetadata(key, value) throw, and a
			// shared mutable map would be modified behind the caller's back.
			this.additionalMetadata = new HashMap<>(additionalMetadata);
			return this;
		}

		/**
		 * @return a configuration holding a snapshot of this builder's current settings
		 */
		public MarkdownDocumentReaderConfig build() {
			return new MarkdownDocumentReaderConfig(this);
		}

	}

}
Loading

0 comments on commit 56e678c

Please sign in to comment.