-
Notifications
You must be signed in to change notification settings - Fork 5.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a5cdab6
commit f48c0ea
Showing
2 changed files
with
61 additions
and
0 deletions.
There are no files selected for viewing
59 changes: 59 additions & 0 deletions
59
src/backend/base/langflow/components/textsplitters/NaturalLanguageTextSplitter.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
from typing import Any | ||
|
||
from langchain_text_splitters import NLTKTextSplitter, TextSplitter | ||
|
||
from langflow.base.textsplitters.model import LCTextSplitterComponent | ||
from langflow.inputs import DataInput, IntInput, MessageTextInput | ||
from langflow.utils.util import unescape_string | ||
|
||
|
||
class NaturalLanguageTextSplitterComponent(LCTextSplitterComponent):
    """Component that splits text with LangChain's ``NLTKTextSplitter``.

    The component exposes chunk size, chunk overlap, a separator and a
    language as inputs, and builds an ``NLTKTextSplitter`` configured from
    them in :meth:`build_text_splitter`.
    """

    display_name = "Natural Language Text Splitter"
    description = "Split text based on natural language boundaries, optimized for a specified language."
    name = "NaturalLanguageTextSplitter"

    inputs = [
        IntInput(
            name="chunk_size",
            display_name="Chunk Size",
            info="The maximum number of characters in each chunk after splitting.",
            value=1000,
        ),
        IntInput(
            name="chunk_overlap",
            display_name="Chunk Overlap",
            info="The number of characters that overlap between consecutive chunks.",
            value=200,
        ),
        DataInput(
            name="data_input",
            display_name="Input",
            info="The text data to be split.",
            input_types=["Document", "Data"],
        ),
        MessageTextInput(
            name="separator",
            display_name="Separator",
            info='The character(s) to use as a delimiter when splitting text.\nDefaults to "\\n\\n" if left empty.',
        ),
        MessageTextInput(
            name="language",
            display_name="Language",
            info='The language of the text. Default is "English". Supports multiple languages for better text boundary recognition.',
        ),
    ]

    def get_data_input(self) -> Any:
        """Return the value of the ``data_input`` input (the data to be split)."""
        return self.data_input

    def build_text_splitter(self) -> TextSplitter:
        """Build the ``NLTKTextSplitter`` described by the component inputs.

        Returns:
            An ``NLTKTextSplitter`` configured with the chosen language,
            separator, chunk size and chunk overlap.
        """
        # An empty separator falls back to a blank line; otherwise the value
        # is passed through ``unescape_string`` (presumably so escape
        # sequences typed as text, e.g. "\\n", become real characters --
        # confirm against langflow.utils.util).
        separator = unescape_string(self.separator) if self.separator else "\n\n"

        # Normalize the language name: surrounding whitespace is dropped and
        # the name is lowercased. A missing or blank value (including a
        # whitespace-only string) falls back to "english" instead of being
        # forwarded as an invalid language name.
        language = (self.language or "").strip().lower() or "english"

        return NLTKTextSplitter(
            language=language,
            separator=separator,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )
2 changes: 2 additions & 0 deletions
2
src/backend/base/langflow/components/textsplitters/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,11 @@ | ||
# Public text splitter components of this package. Every name listed in
# ``__all__`` must be imported under exactly that name so that
# ``from <package> import <name>`` and star-imports resolve.
from .CharacterTextSplitter import CharacterTextSplitterComponent
from .LanguageRecursiveTextSplitter import LanguageRecursiveTextSplitterComponent
from .NaturalLanguageTextSplitter import NaturalLanguageTextSplitterComponent
from .RecursiveCharacterTextSplitter import RecursiveCharacterTextSplitterComponent

__all__ = [
    "CharacterTextSplitterComponent",
    "LanguageRecursiveTextSplitterComponent",
    "NaturalLanguageTextSplitterComponent",
    "RecursiveCharacterTextSplitterComponent",
]