From f48c0eadf43f36344b36d852e4f48c6e0c177d7a Mon Sep 17 00:00:00 2001
From: Uladzislau Kaminski
Date: Sat, 17 Aug 2024 00:21:50 +0100
Subject: [PATCH] feat: nltk text splitter support

---
 .../NaturalLanguageTextSplitter.py            | 67 +++++++++++++++++++
 .../components/textsplitters/__init__.py      |  2 +
 2 files changed, 69 insertions(+)
 create mode 100644 src/backend/base/langflow/components/textsplitters/NaturalLanguageTextSplitter.py

diff --git a/src/backend/base/langflow/components/textsplitters/NaturalLanguageTextSplitter.py b/src/backend/base/langflow/components/textsplitters/NaturalLanguageTextSplitter.py
new file mode 100644
index 000000000000..8f3de2ec2360
--- /dev/null
+++ b/src/backend/base/langflow/components/textsplitters/NaturalLanguageTextSplitter.py
@@ -0,0 +1,67 @@
+from typing import Any
+
+from langchain_text_splitters import NLTKTextSplitter, TextSplitter
+
+from langflow.base.textsplitters.model import LCTextSplitterComponent
+from langflow.inputs import DataInput, IntInput, MessageTextInput
+from langflow.utils.util import unescape_string
+
+
+class NaturalLanguageTextSplitterComponent(LCTextSplitterComponent):
+    """Component that builds an ``NLTKTextSplitter`` from its declared inputs."""
+
+    display_name = "Natural Language Text Splitter"
+    description = "Split text based on natural language boundaries, optimized for a specified language."
+    name = "NaturalLanguageTextSplitter"
+
+    inputs = [
+        IntInput(
+            name="chunk_size",
+            display_name="Chunk Size",
+            info="The maximum number of characters in each chunk after splitting.",
+            value=1000,
+        ),
+        IntInput(
+            name="chunk_overlap",
+            display_name="Chunk Overlap",
+            info="The number of characters that overlap between consecutive chunks.",
+            value=200,
+        ),
+        DataInput(
+            name="data_input",
+            display_name="Input",
+            info="The text data to be split.",
+            input_types=["Document", "Data"],
+        ),
+        MessageTextInput(
+            name="separator",
+            display_name="Separator",
+            info='The character(s) to use as a delimiter when splitting text.\nDefaults to "\\n\\n" if left empty.',
+        ),
+        MessageTextInput(
+            name="language",
+            display_name="Language",
+            info='The language of the text. Default is "English". Supports multiple languages for better text boundary recognition.',
+        ),
+    ]
+
+    def get_data_input(self) -> Any:
+        """Return the value of the ``data_input`` input."""
+        return self.data_input
+
+    def build_text_splitter(self) -> TextSplitter:
+        """Build an ``NLTKTextSplitter`` from the component inputs.
+
+        An empty separator falls back to two newlines, and an empty or
+        whitespace-only language falls back to ``"english"``.
+        """
+        # A user-supplied separator is passed through unescape_string before use.
+        separator = unescape_string(self.separator) if self.separator else "\n\n"
+        # Strip stray whitespace so input such as " English " normalizes to "english".
+        language = (self.language or "").strip().lower() or "english"
+        return NLTKTextSplitter(
+            language=language,
+            separator=separator,
+            chunk_size=self.chunk_size,
+            chunk_overlap=self.chunk_overlap,
+        )
diff --git a/src/backend/base/langflow/components/textsplitters/__init__.py b/src/backend/base/langflow/components/textsplitters/__init__.py
index eb2d6af13206..0d1847e8a5f4 100644
--- a/src/backend/base/langflow/components/textsplitters/__init__.py
+++ b/src/backend/base/langflow/components/textsplitters/__init__.py
@@ -1,9 +1,11 @@
 from .CharacterTextSplitter import CharacterTextSplitterComponent
 from .LanguageRecursiveTextSplitter import LanguageRecursiveTextSplitterComponent
 from .RecursiveCharacterTextSplitter import RecursiveCharacterTextSplitterComponent
+from .NaturalLanguageTextSplitter import NaturalLanguageTextSplitterComponent
 
 __all__ = [
     "CharacterTextSplitterComponent",
     "LanguageRecursiveTextSplitterComponent",
     "RecursiveCharacterTextSplitterComponent",
+    "NaturalLanguageTextSplitterComponent",
 ]