diff --git a/tests/TokenizerTest.php b/tests/TokenizerTest.php index 49f74ba..fbb7747 100644 --- a/tests/TokenizerTest.php +++ b/tests/TokenizerTest.php @@ -5,132 +5,138 @@ use Phonyland\NGram\Tokenizer; use Phonyland\NGram\TokenizerFilterType; -it('Tokenizer@tokenize: Seperates the text with the given separator', function (): void { - // Arrange - $tokenizer = new Tokenizer(); - $tokenizer->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR); +describe('Tokenizer@tokenize', function () { + test('it can separate the text from the given separator', function (): void { + // Arrange + $tokenizer = new Tokenizer(); + $tokenizer->addWordSeparatorPattern(wordSeparationPattern: TokenizerFilterType::WHITESPACE_SEPARATOR); - $text = 'sample text'; + $text = 'sample text'; - // Act - $result = $tokenizer->tokenize($text); + // Act + $result = $tokenizer->tokenize($text); - // Assert - expect($result)->toBe(['sample', 'text']); -}); + // Assert + expect($result)->toBe(['sample', 'text']); + }); -it('Tokenizer@tokenize: Seperates the text with multiple separators', function (): void { - // Arrange - $tokenizer = new Tokenizer(); - $tokenizer - ->addWordSeparatorPattern(' ') - ->addWordSeparatorPattern(';'); + test('it can separate the text from multiple separators', function (): void { + // Arrange + $tokenizer = new Tokenizer(); + $tokenizer + ->addWordSeparatorPattern(' ') + ->addWordSeparatorPattern(';'); - $text = 'sample text;sample;text'; + $text = 'sample text;sample;text'; - // Act - $result = $tokenizer->tokenize($text); + // Act + $result = $tokenizer->tokenize($text); - // Assert - expect($result)->toBe(['sample', 'text', 'sample', 'text']); -}); + // Assert + expect($result)->toBe(['sample', 'text', 'sample', 'text']); + }); -it('Tokenizer@tokenize: Seperates the text with regex patterns', function (): void { - // Arrange - $tokenizer = new Tokenizer(); - $tokenizer->addWordSeparatorPattern('\s'); + test('it can separate the text from regex 
patterns', function (): void { + // Arrange + $tokenizer = new Tokenizer(); + $tokenizer->addWordSeparatorPattern('\s'); - $text = 'sample text '.PHP_EOL.'sample text'; + $text = 'sample text '.PHP_EOL.'sample text'; - // Act - $result = $tokenizer->tokenize($text); + // Act + $result = $tokenizer->tokenize($text); - // Assert - expect($result)->toBe(['sample', 'text', 'sample', 'text']); -}); + // Assert + expect($result)->toBe(['sample', 'text', 'sample', 'text']); + }); -it('Tokenizer@tokenize: A minimum word length can be set', function (): void { - // Arrange - $tokenizer = new Tokenizer(); - $tokenizer->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR); + test('A minimum word length can be set', function (): void { + // Arrange + $tokenizer = new Tokenizer(); + $tokenizer->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR); - $text = 'A sample text with a some meaningless words'; + $text = 'A sample text with some meaningless words'; - // Act - $result = $tokenizer->tokenize($text, 6); + // Act + $result = $tokenizer->tokenize(text: $text, minWordLength: 6); - // Assert - expect($result)->toBe(['sample', 'meaningless']); -}); + // Assert + expect($result)->toBe(['sample', 'meaningless']); + }); -it('Tokenizer@sentences: Separates the text with the given punctuation into sentences', function (): void { - // Act - $tokenizer = new Tokenizer(); - $tokenizer - ->addSentenceSeparatorPattern('.') - ->addSentenceSeparatorPattern('!') - ->addSentenceSeparatorPattern('?'); - - $text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... 
End'; + test('it can filter the tokens by given removal rule', function (): void { + // Arrange + $tokenizer = (new Tokenizer()) - // Act - $result = $tokenizer->sentences($text); + ->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR); + + // Act I + $tokenizer->addWordFilterRule('/m/'); + // Assert I + expect($tokenizer->tokenize('sample text'))->toBe(['saple', 'text']); - // Assert - expect($result)->toBe([ - 'Sample Sentence.', - 'Sample Sentence!', - 'Sample Sentence?', - 'Sample Sentence no.', - '4?!', - 'Sample sample sentence...', - 'End', - ]); + // Act II + $tokenizer->addWordFilterRule('/x/', 'q'); + // Assert II + expect($tokenizer->tokenize('sample text'))->toBe(['saple', 'teqt']); + }); }); -it('Tokenizer@tokenizeBySentences: Separates the text into tokens by sentences', function (): void { - // Arrange - $tokenizer = new Tokenizer(); - $tokenizer - ->addSentenceSeparatorPattern('.') - ->addSentenceSeparatorPattern('!') - ->addSentenceSeparatorPattern('?') - ->addWordFilterRule(TokenizerFilterType::NO_SYMBOLS) - ->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR); - - $text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End'; - - // Act - $result = $tokenizer->tokenizeBySentences($text); - - // Assert - expect($result)->toBe([ - ['Sample', 'Sentence'], - ['Sample', 'Sentence'], - ['Sample', 'Sentence'], - ['Sample', 'Sentence', 'no'], - ['Sample', 'sample', 'sentence'], - ['End'], - ]); +describe('Tokenizer@sentences', function () { + test('it can separate the text with the given punctuation into sentences', function (): void { + // Act + $tokenizer = new Tokenizer(); + $tokenizer + ->addSentenceSeparatorPattern('.') + ->addSentenceSeparatorPattern('!') + ->addSentenceSeparatorPattern('?'); + + $text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... 
End'; + + // Act + $result = $tokenizer->sentences($text); + + // Assert + expect($result)->toBe([ + 'Sample Sentence.', + 'Sample Sentence!', + 'Sample Sentence?', + 'Sample Sentence no.', + '4?!', + 'Sample sample sentence...', + 'End', + ]); + }); }); -it('Tokenizer@tokenize: Filters the tokens by given removal rule', function (): void { - // Arrange - $tokenizer = (new Tokenizer()) - ->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR); - - // Act I - $tokenizer->addWordFilterRule('/m/'); - // Assert I - expect($tokenizer->tokenize('sample text'))->toBe(['saple', 'text']); - - // Act II - $tokenizer->addWordFilterRule('/x/', 'q'); - // Assert II - expect($tokenizer->tokenize('sample text'))->toBe(['saple', 'teqt']); +describe('Tokenizer@tokenizeBySentences', function () { + test('it can separate the text into tokens by sentences', function (): void { + // Arrange + $tokenizer = new Tokenizer(); + $tokenizer + ->addSentenceSeparatorPattern('.') + ->addSentenceSeparatorPattern('!') + ->addSentenceSeparatorPattern('?') + ->addWordFilterRule(TokenizerFilterType::NO_SYMBOLS) + ->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR); + + $text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... 
End'; + + // Act + $result = $tokenizer->tokenizeBySentences($text); + + // Assert + expect($result)->toBe([ + ['Sample', 'Sentence'], + ['Sample', 'Sentence'], + ['Sample', 'Sentence'], + ['Sample', 'Sentence', 'no'], + ['Sample', 'sample', 'sentence'], + ['End'], + ]); + }); }); -it('Tokenizer: Can convert tokens to lowercase', function (): void { +test('it can convert tokens to lowercase', function (): void { // Arrange $tokenizer = (new Tokenizer()) ->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR) @@ -145,7 +151,7 @@ expect($result)->toBe(['sample', 'text']); }); -test('Tokenizer: Can be converted to an array', function (): void { +test('it can convert tokens to an array', function (): void { // Arrange $tokenizer = (new Tokenizer()) ->addWordFilterRule(TokenizerFilterType::NO_SYMBOLS)