Skip to content

Commit

Permalink
Refactor test structure and add a token-filtering test
Browse files Browse the repository at this point in the history
The 'TokenizerTest' method names are updated for a more concise explanation of the tests. Additionally, a new test case for token filtering rule was introduced. All the tests are organized using 'describe' and 'test' blocks instead of just 'it' for better readability and structure.
  • Loading branch information
deligoez committed Jan 2, 2024
1 parent 9dccd04 commit 2ad391a
Showing 1 changed file with 109 additions and 103 deletions.
212 changes: 109 additions & 103 deletions tests/TokenizerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,132 +5,138 @@
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\TokenizerFilterType;

it('Tokenizer@tokenize: Seperates the text with the given separator', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);
describe('Tokenizer@tokenize', function () {
test('it can separate the text from the given separator', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(wordSeparationPattern: TokenizerFilterType::WHITESPACE_SEPARATOR);

$text = 'sample text';
$text = 'sample text';

// Act
$result = $tokenizer->tokenize($text);
// Act
$result = $tokenizer->tokenize($text);

// Assert
expect($result)->toBe(['sample', 'text']);
});
// Assert
expect($result)->toBe(['sample', 'text']);
});

it('Tokenizer@tokenize: Seperates the text with multiple separators', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer
->addWordSeparatorPattern(' ')
->addWordSeparatorPattern(';');
test('it can separate the text from multiple separators', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer
->addWordSeparatorPattern(' ')
->addWordSeparatorPattern(';');

$text = 'sample text;sample;text';
$text = 'sample text;sample;text';

// Act
$result = $tokenizer->tokenize($text);
// Act
$result = $tokenizer->tokenize($text);

// Assert
expect($result)->toBe(['sample', 'text', 'sample', 'text']);
});
// Assert
expect($result)->toBe(['sample', 'text', 'sample', 'text']);
});

it('Tokenizer@tokenize: Seperates the text with regex patterns', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern('\s');
test('it can separate the text from regex patterns', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern('\s');

$text = 'sample text '.PHP_EOL.'sample text';
$text = 'sample text '.PHP_EOL.'sample text';

// Act
$result = $tokenizer->tokenize($text);
// Act
$result = $tokenizer->tokenize($text);

// Assert
expect($result)->toBe(['sample', 'text', 'sample', 'text']);
});
// Assert
expect($result)->toBe(['sample', 'text', 'sample', 'text']);
});

it('Tokenizer@tokenize: A minimum word length can be set', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);
test('A minimum word length can be set', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);

$text = 'A sample text with a some meaningless words';
$text = 'A sample text with some meaningless words';

// Act
$result = $tokenizer->tokenize($text, 6);
// Act
$result = $tokenizer->tokenize(text: $text, minWordLength: 6);

// Assert
expect($result)->toBe(['sample', 'meaningless']);
});
// Assert
expect($result)->toBe(['sample', 'meaningless']);
});

it('Tokenizer@sentences: Separates the text with the given punctuation into sentences', function (): void {
// Act
$tokenizer = new Tokenizer();
$tokenizer
->addSentenceSeparatorPattern('.')
->addSentenceSeparatorPattern('!')
->addSentenceSeparatorPattern('?');

$text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End';
it('it can filter the tokens by given removal rule', function (): void {
// Arrange
$tokenizer = (new Tokenizer())
->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);

// Act
$result = $tokenizer->sentences($text);
// Act I
$tokenizer->addWordFilterRule('/m/');
// Assert I
expect($tokenizer->tokenize('sample text'))->toBe(['saple', 'text']);

// Assert
expect($result)->toBe([
'Sample Sentence.',
'Sample Sentence!',
'Sample Sentence?',
'Sample Sentence no.',
'4?!',
'Sample sample sentence...',
'End',
]);
// Act II
$tokenizer->addWordFilterRule('/x/', 'q');
// Assert II
expect($tokenizer->tokenize('sample text'))->toBe(['saple', 'teqt']);
});
});

it('Tokenizer@tokenizeBySentences: Separates the text into tokens by sentences', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer
->addSentenceSeparatorPattern('.')
->addSentenceSeparatorPattern('!')
->addSentenceSeparatorPattern('?')
->addWordFilterRule(TokenizerFilterType::NO_SYMBOLS)
->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);

$text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End';

// Act
$result = $tokenizer->tokenizeBySentences($text);

// Assert
expect($result)->toBe([
['Sample', 'Sentence'],
['Sample', 'Sentence'],
['Sample', 'Sentence'],
['Sample', 'Sentence', 'no'],
['Sample', 'sample', 'sentence'],
['End'],
]);
// Verifies Tokenizer@sentences: splitting raw text into sentence strings
// on the configured punctuation patterns.
describe('Tokenizer@sentences', function () {
    test('it can separate the text with the given punctuation into sentences', function (): void {
        // Arrange: a tokenizer that treats '.', '!' and '?' as sentence terminators
        $tokenizer = new Tokenizer();
        $tokenizer
            ->addSentenceSeparatorPattern('.')
            ->addSentenceSeparatorPattern('!')
            ->addSentenceSeparatorPattern('?');

        $text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End';

        // Act
        $result = $tokenizer->sentences($text);

        // Assert: consecutive terminators ('?!', '...') stay attached to their
        // sentence, and trailing text without punctuation ('End') is kept as-is
        expect($result)->toBe([
            'Sample Sentence.',
            'Sample Sentence!',
            'Sample Sentence?',
            'Sample Sentence no.',
            '4?!',
            'Sample sample sentence...',
            'End',
        ]);
    });
});

it('Tokenizer@tokenize: Filters the tokens by given removal rule', function (): void {
// Arrange
$tokenizer = (new Tokenizer())
->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);

// Act I
$tokenizer->addWordFilterRule('/m/');
// Assert I
expect($tokenizer->tokenize('sample text'))->toBe(['saple', 'text']);

// Act II
$tokenizer->addWordFilterRule('/x/', 'q');
// Assert II
expect($tokenizer->tokenize('sample text'))->toBe(['saple', 'teqt']);
// Verifies Tokenizer@tokenizeBySentences: the text is first split into
// sentences, then each sentence is split into filtered word tokens.
describe('Tokenizer@tokenizeBySentences', function () {
    test('it can separate the text into tokens by sentences', function (): void {
        // Arrange: sentence terminators plus symbol filtering and
        // whitespace word separation, chained on a fresh instance
        $tokenizer = (new Tokenizer())
            ->addSentenceSeparatorPattern('.')
            ->addSentenceSeparatorPattern('!')
            ->addSentenceSeparatorPattern('?')
            ->addWordFilterRule(TokenizerFilterType::NO_SYMBOLS)
            ->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);

        $sampleText = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End';

        // Act
        $tokensBySentence = $tokenizer->tokenizeBySentences($sampleText);

        // Assert: one inner array per sentence; punctuation and the
        // symbol-only token '4?!' are filtered out entirely
        expect($tokensBySentence)->toBe([
            ['Sample', 'Sentence'],
            ['Sample', 'Sentence'],
            ['Sample', 'Sentence'],
            ['Sample', 'Sentence', 'no'],
            ['Sample', 'sample', 'sentence'],
            ['End'],
        ]);
    });
});

it('Tokenizer: Can convert tokens to lowercase', function (): void {
test('it can convert tokens to lowercase', function (): void {
// Arrange
$tokenizer = (new Tokenizer())
->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR)
Expand All @@ -145,7 +151,7 @@
expect($result)->toBe(['sample', 'text']);
});

test('Tokenizer: Can be converted to an array', function (): void {
test('it can convert tokens to an array', function (): void {
// Arrange
$tokenizer = (new Tokenizer())
->addWordFilterRule(TokenizerFilterType::NO_SYMBOLS)
Expand Down

0 comments on commit 2ad391a

Please sign in to comment.