Commit 4a2db47
Merge pull request #10 from kortemik/partial-token
Partial token
kortemik authored Oct 11, 2023
2 parents 058b6a8 + f4f15f4 commit 4a2db47
Showing 2 changed files with 32 additions and 2 deletions.
src/main/java/com/teragrep/blf_01/Tokenizer.java (12 additions, 2 deletions)
@@ -59,8 +59,12 @@ public class Tokenizer {
     final Entanglement entanglement;
     final TokenScan majorTokenScan;
     final TokenScan minorTokenScan;
+    final long maxTokenCount;
 
     public Tokenizer() {
+        this(Long.MAX_VALUE);
+    }
+    public Tokenizer(long maxTokenCount) {
 
         final MajorDelimiters majorDelimiters = new MajorDelimiters();
         final MinorDelimiters minorDelimiters = new MinorDelimiters();
@@ -69,7 +73,7 @@ public Tokenizer() {
         this.entanglement = new Entanglement();
         this.majorTokenScan = new TokenScan(majorDelimiters);
         this.minorTokenScan = new TokenScan(minorDelimiters);
-
+        this.maxTokenCount = maxTokenCount;
     }
 
     /**
@@ -94,8 +98,14 @@ public List<Token> tokenize(InputStream is) {
 
             ArrayList<Token> minorTokens = minorTokenScan.findBy(stream);
 
+            ArrayList<Token> tokens;
+            if (minorTokens.size() > maxTokenCount) {
+                tokens = minorTokens;
+            } else {
+                tokens = entanglement.entangle(minorTokens);
+            }
 
-            allTokens.addAll(entanglement.entangle(minorTokens));
+            allTokens.addAll(tokens);
         }
 
         return allTokens;
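
For context, a minimal usage sketch of the new overload; the PartialTokenExample class, its main method, and the sample input are illustrative only and not part of this change. Per the hunks above, when a major token splits into more minor tokens than maxTokenCount, the entanglement step is skipped for that major token and the plain minor tokens are emitted instead, while the no-arg constructor keeps the old behaviour by delegating with Long.MAX_VALUE.

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;

import com.teragrep.blf_01.Token;
import com.teragrep.blf_01.Tokenizer;

public class PartialTokenExample {
    public static void main(String[] args) {
        // No-arg constructor delegates to Tokenizer(Long.MAX_VALUE):
        // every major token goes through entanglement, as before.
        Tokenizer unlimited = new Tokenizer();

        // New overload: a major token that yields more than 3 minor tokens
        // is added as plain minor tokens, without entanglement.
        Tokenizer limited = new Tokenizer(3);

        byte[] data = "Abc#####Xyz".getBytes(StandardCharsets.UTF_8);

        List<Token> full = unlimited.tokenize(new ByteArrayInputStream(data));
        List<Token> partial = limited.tokenize(new ByteArrayInputStream(data));

        // "Abc#####Xyz" splits into more than 3 minor tokens, so the limited
        // tokenizer skips entanglement and its list is at most as long as the
        // unlimited one.
        System.out.println("unlimited: " + full.size() + ", limited: " + partial.size());
    }
}
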
src/test/java/com/teragrep/blf_01/TokenizerTest.java (20 additions, 0 deletions)
@@ -90,6 +90,26 @@ public void testTokenization() {
 
     }
 
+
+    @Test
+    public void testTokenizerSizeLimit() {
+        Tokenizer tokenizer = new Tokenizer(3);
+        String input = "Abc#####Xyz";
+        ByteArrayInputStream bais = new ByteArrayInputStream(input.getBytes(StandardCharsets.UTF_8));
+        List<Token> result = tokenizer.tokenize(bais);
+
+        List<String> expected =
+                Arrays.asList(
+                        "Abc#####Xyz", "Abc", "#", "#", "#", "#", "#", "Xyz"
+                );
+
+        assertTrue(result.stream()
+                .map(Token::toString)
+                .collect(Collectors.toList())
+                .containsAll(expected));
+
+    }
+
     @Test
     @Benchmark
     public void tokenizeFileInput() throws FileNotFoundException {
