forked from elastic/elasticsearch
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ML] Sentence Chunker (elastic#110334)
The Sentence chunker splits long text into smaller chunks on sentence boundaries.
- Loading branch information
Showing
15 changed files
with
384 additions
and
27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
pr: 110334 | ||
summary: Sentence Chunker | ||
area: Machine Learning | ||
type: enhancement | ||
issues: [] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
138 changes: 138 additions & 0 deletions
138
...nce/src/main/java/org/elasticsearch/xpack/inference/chunking/SentenceBoundaryChunker.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the Elastic License | ||
* 2.0; you may not use this file except in compliance with the Elastic License | ||
* 2.0. | ||
*/ | ||
|
||
package org.elasticsearch.xpack.inference.chunking; | ||
|
||
import com.ibm.icu.text.BreakIterator; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Locale; | ||
|
||
/** | ||
* Split text into chunks aligned on sentence boundaries. | ||
* The maximum chunk size is measured in words and controlled | ||
* by {@code maxNumberWordsPerChunk}. Sentences are combined | ||
* greedily until adding the next sentence would exceed | ||
* {@code maxNumberWordsPerChunk}, at which point a new chunk | ||
* is created. If an individual sentence is longer than | ||
* {@code maxNumberWordsPerChunk} it is split on word boundary with | ||
* overlap. | ||
*/ | ||
public class SentenceBoundaryChunker { | ||
|
||
private final BreakIterator sentenceIterator; | ||
private final BreakIterator wordIterator; | ||
|
||
public SentenceBoundaryChunker() { | ||
sentenceIterator = BreakIterator.getSentenceInstance(Locale.ROOT); | ||
wordIterator = BreakIterator.getWordInstance(Locale.ROOT); | ||
} | ||
|
||
/** | ||
* Break the input text into small chunks on sentence boundaries. | ||
* | ||
* @param input Text to chunk | ||
* @param maxNumberWordsPerChunk Maximum size of the chunk | ||
* @return The input text chunked | ||
*/ | ||
public List<String> chunk(String input, int maxNumberWordsPerChunk) { | ||
var chunks = new ArrayList<String>(); | ||
|
||
sentenceIterator.setText(input); | ||
wordIterator.setText(input); | ||
|
||
int chunkStart = 0; | ||
int chunkEnd = 0; | ||
int sentenceStart = 0; | ||
int chunkWordCount = 0; | ||
|
||
int boundary = sentenceIterator.next(); | ||
|
||
while (boundary != BreakIterator.DONE) { | ||
int sentenceEnd = sentenceIterator.current(); | ||
int countWordsInSentence = countWords(sentenceStart, sentenceEnd); | ||
|
||
if (chunkWordCount + countWordsInSentence > maxNumberWordsPerChunk) { | ||
// over the max chunk size, roll back to the last sentence | ||
|
||
if (chunkWordCount > 0) { | ||
// add a new chunk containing all the input up to this sentence | ||
chunks.add(input.substring(chunkStart, chunkEnd)); | ||
chunkStart = chunkEnd; | ||
chunkWordCount = countWordsInSentence; // the next chunk will contain this sentence | ||
} | ||
|
||
if (countWordsInSentence > maxNumberWordsPerChunk) { | ||
// This sentence is bigger than the max chunk size. | ||
// Split the sentence on the word boundary | ||
var sentenceSplits = splitLongSentence( | ||
input.substring(chunkStart, sentenceEnd), | ||
maxNumberWordsPerChunk, | ||
overlapForChunkSize(maxNumberWordsPerChunk) | ||
); | ||
|
||
int i = 0; | ||
for (; i < sentenceSplits.size() - 1; i++) { | ||
// Because the substring was passed to splitLongSentence() | ||
// the returned positions need to be offset by chunkStart | ||
chunks.add(input.substring(chunkStart + sentenceSplits.get(i).start(), chunkStart + sentenceSplits.get(i).end())); | ||
} | ||
// The final split is partially filled. | ||
// Set the next chunk start to the beginning of the | ||
// final split of the long sentence. | ||
chunkStart = chunkStart + sentenceSplits.get(i).start(); // start pos needs to be offset by chunkStart | ||
chunkWordCount = sentenceSplits.get(i).wordCount(); | ||
} | ||
} else { | ||
chunkWordCount += countWordsInSentence; | ||
} | ||
|
||
sentenceStart = sentenceEnd; | ||
chunkEnd = sentenceEnd; | ||
|
||
boundary = sentenceIterator.next(); | ||
} | ||
|
||
if (chunkWordCount > 0) { | ||
chunks.add(input.substring(chunkStart)); | ||
} | ||
|
||
return chunks; | ||
} | ||
|
||
static List<WordBoundaryChunker.ChunkPosition> splitLongSentence(String text, int maxNumberOfWords, int overlap) { | ||
return new WordBoundaryChunker().chunkPositions(text, maxNumberOfWords, overlap); | ||
} | ||
|
||
private int countWords(int start, int end) { | ||
return countWords(start, end, this.wordIterator); | ||
} | ||
|
||
// Exposed for testing. wordIterator should have had | ||
// setText() applied before using this function. | ||
static int countWords(int start, int end, BreakIterator wordIterator) { | ||
assert start < end; | ||
wordIterator.preceding(start); // start of the current word | ||
|
||
int boundary = wordIterator.current(); | ||
int wordCount = 0; | ||
while (boundary != BreakIterator.DONE && boundary <= end) { | ||
int wordStatus = wordIterator.getRuleStatus(); | ||
if (wordStatus != BreakIterator.WORD_NONE) { | ||
wordCount++; | ||
} | ||
boundary = wordIterator.next(); | ||
} | ||
|
||
return wordCount; | ||
} | ||
|
||
private static int overlapForChunkSize(int chunkSize) { | ||
return (chunkSize - 1) / 2; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.