Add Markdown document reader with enhanced features

This commit introduces a new Markdown document reader with several key features and improvements:

* Add support for text with various formatting elements
* Implement handling for horizontal rules and hard line breaks
* Add functionality for inline and block code sections
* Incorporate blockquote handling
* Support ordered and unordered lists
* Introduce additional metadata capabilities
* Include JavaDocs

Update ETL documentation to reflect these new features and usage.

Fixes #105
spring-projects · Aug 22, 2024 · 56e678c · 56e678c
1 parent a0ee10f
commit 56e678c
Show file tree

Hide file tree

Showing 14 changed files with 1,356 additions and 96 deletions.
diff --git a/document-readers/markdown-reader/pom.xml b/document-readers/markdown-reader/pom.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+		 xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+		 xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+	<parent>
+		<groupId>org.springframework.ai</groupId>
+		<artifactId>spring-ai</artifactId>
+		<version>1.0.0-SNAPSHOT</version>
+		<relativePath>../../pom.xml</relativePath>
+	</parent>
+	<artifactId>spring-ai-markdown-document-reader</artifactId>
+	<packaging>jar</packaging>
+	<name>Spring AI Document Reader - Markdown</name>
+	<description>Spring AI Markdown document reader</description>
+	<url>https://github.com/spring-projects/spring-ai</url>
+
+	<scm>
+		<url>https://github.com/spring-projects/spring-ai</url>
+		<connection>git://github.com/spring-projects/spring-ai.git</connection>
+		<developerConnection>git@github.com:spring-projects/spring-ai.git</developerConnection>
+	</scm>
+
+	<dependencies>
+		<!-- Core Spring AI module; pinned to the parent's version so both are released in lockstep. -->
+		<dependency>
+			<groupId>org.springframework.ai</groupId>
+			<artifactId>spring-ai-core</artifactId>
+			<!-- ${parent.version} is a deprecated expression; use the project-prefixed form. -->
+			<version>${project.parent.version}</version>
+		</dependency>
+
+		<!-- Markdown parser used by the reader. -->
+		<dependency>
+			<groupId>org.commonmark</groupId>
+			<artifactId>commonmark</artifactId>
+			<version>${commonmark.version}</version>
+		</dependency>
+
+		<!-- TESTING -->
+		<dependency>
+			<groupId>org.springframework.boot</groupId>
+			<artifactId>spring-boot-starter-test</artifactId>
+			<scope>test</scope>
+		</dependency>
+
+	</dependencies>
+
+</project>
diff --git a/...n-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java b/...n-reader/src/main/java/org/springframework/ai/reader/markdown/MarkdownDocumentReader.java
@@ -0,0 +1,207 @@
+package org.springframework.ai.reader.markdown;
+
+import org.commonmark.node.*;
+import org.commonmark.parser.Parser;
+import org.springframework.ai.document.Document;
+import org.springframework.ai.document.DocumentReader;
+import org.springframework.ai.reader.markdown.config.MarkdownDocumentReaderConfig;
+import org.springframework.core.io.DefaultResourceLoader;
+import org.springframework.core.io.Resource;
+
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Reads the given Markdown resource and groups headers, paragraphs, or text divided by
+ * horizontal lines (depending on the
+ * {@link MarkdownDocumentReaderConfig#horizontalRuleCreateDocument} configuration) into
+ * {@link Document}s.
+ * <p>
+ * The resource is parsed with commonmark and the resulting node tree is walked by a
+ * {@link DocumentVisitor}, which accumulates text fragments and emits a new
+ * {@link Document} each time a flush point (heading, fenced code block, and optionally
+ * a horizontal rule or blockquote) is reached.
+ *
+ * @author Piotr Olaszewski
+ */
+public class MarkdownDocumentReader implements DocumentReader {
+
+	/**
+	 * The resource pointing to the Markdown document.
+	 */
+	private final Resource markdownResource;
+
+	/**
+	 * Configuration of the parsing process.
+	 */
+	private final MarkdownDocumentReaderConfig config;
+
+	/**
+	 * Markdown parser.
+	 */
+	private final Parser parser;
+
+	/**
+	 * Creates a reader for the given resource location using the default configuration.
+	 * @param markdownResource location of the Markdown resource, resolved through a
+	 * {@link DefaultResourceLoader}
+	 */
+	public MarkdownDocumentReader(String markdownResource) {
+		this(new DefaultResourceLoader().getResource(markdownResource), MarkdownDocumentReaderConfig.defaultConfig());
+	}
+
+	/**
+	 * Creates a reader for the given resource location.
+	 * @param markdownResource location of the Markdown resource, resolved through a
+	 * {@link DefaultResourceLoader}
+	 * @param config configuration of the parsing process
+	 */
+	public MarkdownDocumentReader(String markdownResource, MarkdownDocumentReaderConfig config) {
+		this(new DefaultResourceLoader().getResource(markdownResource), config);
+	}
+
+	/**
+	 * Creates a reader for the given resource.
+	 * @param markdownResource the Markdown resource to read
+	 * @param config configuration of the parsing process
+	 */
+	public MarkdownDocumentReader(Resource markdownResource, MarkdownDocumentReaderConfig config) {
+		this.markdownResource = markdownResource;
+		this.config = config;
+		this.parser = Parser.builder().build();
+	}
+
+	/**
+	 * Extracts and returns a list of documents from the resource.
+	 * <p>
+	 * The resource content is decoded as UTF-8.
+	 * @return List of extracted {@link Document}
+	 * @throws RuntimeException wrapping the {@link IOException} if the resource cannot
+	 * be opened or read
+	 */
+	@Override
+	public List<Document> get() {
+		try (var input = markdownResource.getInputStream()) {
+			// Decode explicitly as UTF-8 instead of relying on the platform default
+			// charset, which would corrupt non-ASCII text on JVMs where it is not UTF-8.
+			Node node = parser.parseReader(new InputStreamReader(input, java.nio.charset.StandardCharsets.UTF_8));
+
+			DocumentVisitor documentVisitor = new DocumentVisitor(config);
+			node.accept(documentVisitor);
+
+			return documentVisitor.getDocuments();
+		}
+		catch (IOException e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+	/**
+	 * A convenient class for visiting handled nodes in the Markdown document.
+	 * <p>
+	 * Text fragments are collected in {@code currentParagraphs}; metadata is collected
+	 * on {@code currentDocumentBuilder}. {@code buildAndFlush()} turns the collected
+	 * state into a {@link Document} and starts a fresh builder.
+	 */
+	static class DocumentVisitor extends AbstractVisitor {
+
+		/** Documents produced so far, in the order they were flushed. */
+		private final List<Document> documents = new ArrayList<>();
+
+		/** Text fragments accumulated for the document currently being built. */
+		private final List<String> currentParagraphs = new ArrayList<>();
+
+		private final MarkdownDocumentReaderConfig config;
+
+		/** Builder receiving metadata for the document currently being built. */
+		private Document.Builder currentDocumentBuilder;
+
+		public DocumentVisitor(MarkdownDocumentReaderConfig config) {
+			this.config = config;
+		}
+
+		/**
+		 * Starts a fresh document builder and traverses the whole Markdown document.
+		 */
+		@Override
+		public void visit(org.commonmark.node.Document document) {
+			currentDocumentBuilder = Document.builder();
+			super.visit(document);
+		}
+
+		/**
+		 * A heading closes the document being built; the heading's own text is then
+		 * recorded as metadata of the next document (see {@link #visit(Text)}).
+		 */
+		@Override
+		public void visit(Heading heading) {
+			buildAndFlush();
+			super.visit(heading);
+		}
+
+		/**
+		 * A horizontal rule closes the current document only when
+		 * {@link MarkdownDocumentReaderConfig#horizontalRuleCreateDocument} is set.
+		 */
+		@Override
+		public void visit(ThematicBreak thematicBreak) {
+			if (config.horizontalRuleCreateDocument) {
+				buildAndFlush();
+			}
+			super.visit(thematicBreak);
+		}
+
+		/** Soft line breaks are rendered as a single space. */
+		@Override
+		public void visit(SoftLineBreak softLineBreak) {
+			translateLineBreakToSpace();
+			super.visit(softLineBreak);
+		}
+
+		/** Hard line breaks are rendered as a single space. */
+		@Override
+		public void visit(HardLineBreak hardLineBreak) {
+			translateLineBreakToSpace();
+			super.visit(hardLineBreak);
+		}
+
+		/** List items are separated from preceding text by a single space. */
+		@Override
+		public void visit(ListItem listItem) {
+			translateLineBreakToSpace();
+			super.visit(listItem);
+		}
+
+		/**
+		 * Marks the document being built as a blockquote. Unless
+		 * {@link MarkdownDocumentReaderConfig#includeBlockquote} is set, the text
+		 * accumulated before the blockquote is flushed into its own document first.
+		 */
+		@Override
+		public void visit(BlockQuote blockQuote) {
+			if (!config.includeBlockquote) {
+				buildAndFlush();
+			}
+
+			translateLineBreakToSpace();
+			currentDocumentBuilder.withMetadata("category", "blockquote");
+			super.visit(blockQuote);
+		}
+
+		/**
+		 * Appends the inline code literal to the current text and tags the document
+		 * being built with the {@code code_inline} category.
+		 */
+		@Override
+		public void visit(Code code) {
+			currentParagraphs.add(code.getLiteral());
+			currentDocumentBuilder.withMetadata("category", "code_inline");
+			super.visit(code);
+		}
+
+		/**
+		 * Appends the fenced code block literal, tags the document with the
+		 * {@code code_block} category and the fence info string as {@code lang}, and
+		 * then always flushes. Unless
+		 * {@link MarkdownDocumentReaderConfig#includeCodeBlock} is set, the text
+		 * accumulated before the code block is flushed into its own document first.
+		 */
+		@Override
+		public void visit(FencedCodeBlock fencedCodeBlock) {
+			if (!config.includeCodeBlock) {
+				buildAndFlush();
+			}
+
+			translateLineBreakToSpace();
+			currentParagraphs.add(fencedCodeBlock.getLiteral());
+			currentDocumentBuilder.withMetadata("category", "code_block");
+			currentDocumentBuilder.withMetadata("lang", fencedCodeBlock.getInfo());
+
+			buildAndFlush();
+
+			super.visit(fencedCodeBlock);
+		}
+
+		/**
+		 * Text directly inside a heading becomes the {@code title} metadata (with a
+		 * {@code header_<level>} category); any other text is appended to the content
+		 * of the document being built.
+		 */
+		@Override
+		public void visit(Text text) {
+			if (text.getParent() instanceof Heading heading) {
+				currentDocumentBuilder.withMetadata("category", "header_%d".formatted(heading.getLevel()))
+					.withMetadata("title", text.getLiteral());
+			}
+			else {
+				currentParagraphs.add(text.getLiteral());
+			}
+
+			super.visit(text);
+		}
+
+		/**
+		 * Flushes any pending text and returns all documents built so far.
+		 * @return the list of built documents
+		 */
+		public List<Document> getDocuments() {
+			buildAndFlush();
+
+			return documents;
+		}
+
+		/**
+		 * Builds a document from the accumulated text (if any), applying the configured
+		 * additional metadata, and resets the state for the next document.
+		 */
+		private void buildAndFlush() {
+			if (!currentParagraphs.isEmpty()) {
+				String content = String.join("", currentParagraphs);
+
+				Document.Builder builder = currentDocumentBuilder.withContent(content);
+
+				config.additionalMetadata.forEach(builder::withMetadata);
+
+				Document document = builder.build();
+
+				documents.add(document);
+
+				currentParagraphs.clear();
+			}
+			// The builder is replaced even when nothing was emitted, so metadata
+			// collected without any accompanying text is discarded.
+			currentDocumentBuilder = Document.builder();
+		}
+
+		/**
+		 * Appends a single space separator, but only when some text has already been
+		 * accumulated, so documents never start with a space.
+		 */
+		private void translateLineBreakToSpace() {
+			if (!currentParagraphs.isEmpty()) {
+				currentParagraphs.add(" ");
+			}
+		}
+
+	}
+
+}
diff --git a/...main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java b/...main/java/org/springframework/ai/reader/markdown/config/MarkdownDocumentReaderConfig.java
@@ -0,0 +1,123 @@
+package org.springframework.ai.reader.markdown.config;
+
+import org.springframework.ai.document.Document;
+import org.springframework.ai.reader.markdown.MarkdownDocumentReader;
+import org.springframework.util.Assert;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Common configuration for the {@link MarkdownDocumentReader}.
+ * <p>
+ * Instances are created through {@link #builder()} or {@link #defaultConfig()}.
+ *
+ * @author Piotr Olaszewski
+ */
+public class MarkdownDocumentReaderConfig {
+
+	/** Whether text divided by horizontal rules is split into separate documents. */
+	public final boolean horizontalRuleCreateDocument;
+
+	/** Whether code blocks are kept in the surrounding paragraph document. */
+	public final boolean includeCodeBlock;
+
+	/** Whether blockquotes are kept in the surrounding paragraph document. */
+	public final boolean includeBlockquote;
+
+	/** Metadata added to every built {@link Document}. */
+	public final Map<String, Object> additionalMetadata;
+
+	/**
+	 * Creates a configuration from the current state of the given builder.
+	 * @param builder the builder holding the configuration values
+	 */
+	public MarkdownDocumentReaderConfig(Builder builder) {
+		horizontalRuleCreateDocument = builder.horizontalRuleCreateDocument;
+		includeCodeBlock = builder.includeCodeBlock;
+		includeBlockquote = builder.includeBlockquote;
+		// Snapshot the metadata so later changes to the builder do not leak into an
+		// already built configuration.
+		additionalMetadata = new HashMap<>(builder.additionalMetadata);
+	}
+
+	/**
+	 * @return the default configuration
+	 */
+	public static MarkdownDocumentReaderConfig defaultConfig() {
+		return builder().build();
+	}
+
+	/**
+	 * @return a new builder with all flags set to {@code false} and no additional
+	 * metadata
+	 */
+	public static Builder builder() {
+		return new Builder();
+	}
+
+	/**
+	 * Builder for {@link MarkdownDocumentReaderConfig}.
+	 */
+	public static class Builder {
+
+		private boolean horizontalRuleCreateDocument = false;
+
+		private boolean includeCodeBlock = false;
+
+		private boolean includeBlockquote = false;
+
+		private Map<String, Object> additionalMetadata = new HashMap<>();
+
+		private Builder() {
+		}
+
+		/**
+		 * Text divided by horizontal lines will create new {@link Document}s. The default
+		 * is {@code false}, meaning text separated by horizontal lines won't create a new
+		 * document.
+		 * @param horizontalRuleCreateDocument flag to determine whether new documents are
+		 * created from text divided by horizontal line
+		 * @return this builder
+		 */
+		public Builder withHorizontalRuleCreateDocument(boolean horizontalRuleCreateDocument) {
+			this.horizontalRuleCreateDocument = horizontalRuleCreateDocument;
+			return this;
+		}
+
+		/**
+		 * Whether to include code blocks in {@link Document}s. The default is
+		 * {@code false}, which means all code blocks are in separate documents.
+		 * @param includeCodeBlock flag to include code block into paragraph document or
+		 * create new with code only
+		 * @return this builder
+		 */
+		public Builder withIncludeCodeBlock(boolean includeCodeBlock) {
+			this.includeCodeBlock = includeCodeBlock;
+			return this;
+		}
+
+		/**
+		 * Whether to include blockquotes in {@link Document}s. The default is
+		 * {@code false}, which means all blockquotes are in separate documents.
+		 * @param includeBlockquote flag to include blockquotes into paragraph document or
+		 * create new with blockquote only
+		 * @return this builder
+		 */
+		public Builder withIncludeBlockquote(boolean includeBlockquote) {
+			this.includeBlockquote = includeBlockquote;
+			return this;
+		}
+
+		/**
+		 * Adds a single metadata entry to all built {@link Document}s.
+		 * @param key the metadata key, must not be {@code null}
+		 * @param value the metadata value, must not be {@code null}
+		 * @return this builder
+		 */
+		public Builder withAdditionalMetadata(String key, Object value) {
+			Assert.notNull(key, "key must not be null");
+			Assert.notNull(value, "value must not be null");
+			this.additionalMetadata.put(key, value);
+			return this;
+		}
+
+		/**
+		 * Replaces the additional metadata added to all built {@link Document}s with
+		 * the entries of the given map.
+		 * @param additionalMetadata the metadata entries, must not be {@code null}
+		 * @return this builder
+		 */
+		public Builder withAdditionalMetadata(Map<String, Object> additionalMetadata) {
+			Assert.notNull(additionalMetadata, "additionalMetadata must not be null");
+			// Copy instead of keeping the caller's reference: the caller's map may be
+			// immutable (e.g. Map.of(...)), which would make a subsequent
+			// withAdditionalMetadata(key, value) fail, and must not be mutated by us.
+			this.additionalMetadata = new HashMap<>(additionalMetadata);
+			return this;
+		}
+
+		/**
+		 * @return a configuration reflecting the current state of this builder
+		 */
+		public MarkdownDocumentReaderConfig build() {
+			return new MarkdownDocumentReaderConfig(this);
+		}
+
+	}
+
+}