Skip to content

Commit

Permalink
feat: nltk text splitter support
Browse files Browse the repository at this point in the history
  • Loading branch information
uladkaminski committed Aug 16, 2024
1 parent a5cdab6 commit f48c0ea
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from typing import Any

from langchain_text_splitters import NLTKTextSplitter, TextSplitter

from langflow.base.textsplitters.model import LCTextSplitterComponent
from langflow.inputs import DataInput, IntInput, MessageTextInput
from langflow.utils.util import unescape_string


class NaturalLanguageTextSplitterComponent(LCTextSplitterComponent):
    """Text-splitter component backed by LangChain's ``NLTKTextSplitter``.

    The component exposes chunk size, chunk overlap, a separator and a
    language name as inputs and forwards them to ``NLTKTextSplitter`` in
    ``build_text_splitter``.
    """

    display_name = "Natural Language Text Splitter"
    description = "Split text based on natural language boundaries, optimized for a specified language."
    name = "NaturalLanguageTextSplitter"

    inputs = [
        IntInput(
            name="chunk_size",
            display_name="Chunk Size",
            info="The maximum number of characters in each chunk after splitting.",
            value=1000,
        ),
        IntInput(
            name="chunk_overlap",
            display_name="Chunk Overlap",
            info="The number of characters that overlap between consecutive chunks.",
            value=200,
        ),
        DataInput(
            name="data_input",
            display_name="Input",
            info="The text data to be split.",
            input_types=["Document", "Data"],
        ),
        MessageTextInput(
            name="separator",
            display_name="Separator",
            info='The character(s) to use as a delimiter when splitting text.\nDefaults to "\\n\\n" if left empty.',
        ),
        MessageTextInput(
            name="language",
            display_name="Language",
            info=(
                'The language of the text. Default is "English". '
                "Supports multiple languages for better text boundary recognition."
            ),
        ),
    ]

    def get_data_input(self) -> Any:
        """Return the value supplied to the ``data_input`` field."""
        return self.data_input

    def build_text_splitter(self) -> TextSplitter:
        """Create the ``NLTKTextSplitter`` configured from the component inputs.

        Returns:
            An ``NLTKTextSplitter`` built with the configured language,
            separator, chunk size and chunk overlap.
        """
        # The separator is passed through ``unescape_string`` (presumably so a
        # typed "\\n" becomes a real newline -- confirm against the helper);
        # an empty separator falls back to a blank line.
        separator = unescape_string(self.separator) if self.separator else "\n\n"

        # Normalise the language name: surrounding whitespace is dropped and
        # the name is lower-cased. A blank or whitespace-only value falls back
        # to "english" instead of being forwarded as an invalid language.
        language = self.language.strip().lower() if self.language else ""

        return NLTKTextSplitter(
            language=language or "english",
            separator=separator,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
# Re-export every text splitter component under its own class name so the
# names listed in ``__all__`` are actually bound in this package namespace.
from .CharacterTextSplitter import CharacterTextSplitterComponent
from .LanguageRecursiveTextSplitter import LanguageRecursiveTextSplitterComponent
from .NaturalLanguageTextSplitter import NaturalLanguageTextSplitterComponent
from .RecursiveCharacterTextSplitter import RecursiveCharacterTextSplitterComponent

Check failure on line 4 in src/backend/base/langflow/components/textsplitters/__init__.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.12)

Ruff (F401)

src/backend/base/langflow/components/textsplitters/__init__.py:4:82: F401 `.NaturalLanguageTextSplitter.NaturalLanguageTextSplitterComponent` imported but unused; consider removing, adding to `__all__`, or using a redundant alias

# Public API of the package: the names exported by a star-import. Every entry
# must match a name bound by an import in this module.
__all__ = [
"CharacterTextSplitterComponent",
"LanguageRecursiveTextSplitterComponent",
"RecursiveCharacterTextSplitterComponent",
"NaturalLanguageTextSplitterComponent",
]

0 comments on commit f48c0ea

Please sign in to comment.