Skip to content

Commit

Permalink
Refactor test structure and add a token-filtering test
Browse files Browse the repository at this point in the history
The 'TokenizerTest' method names are updated for a more concise explanation of the tests. Additionally, a new test case for token filtering rule was introduced. All the tests are organized using 'describe' and 'test' blocks instead of just 'it' for better readability and structure.
  • Loading branch information
deligoez committed Jan 2, 2024
1 parent 9dccd04 commit 2ad391a
Showing 1 changed file with 109 additions and 103 deletions.
212 changes: 109 additions & 103 deletions tests/TokenizerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,132 +5,138 @@
use Phonyland\NGram\Tokenizer;
use Phonyland\NGram\TokenizerFilterType;

it('Tokenizer@tokenize: Seperates the text with the given separator', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);
describe('Tokenizer@tokenize', function () {
test('it can separate the text from the given separator', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(wordSeparationPattern: TokenizerFilterType::WHITESPACE_SEPARATOR);

$text = 'sample text';
$text = 'sample text';

// Act
$result = $tokenizer->tokenize($text);
// Act
$result = $tokenizer->tokenize($text);

// Assert
expect($result)->toBe(['sample', 'text']);
});
// Assert
expect($result)->toBe(['sample', 'text']);
});

it('Tokenizer@tokenize: Seperates the text with multiple separators', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer
->addWordSeparatorPattern(' ')
->addWordSeparatorPattern(';');
test('it can separate the text from multiple separators', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer
->addWordSeparatorPattern(' ')
->addWordSeparatorPattern(';');

$text = 'sample text;sample;text';
$text = 'sample text;sample;text';

// Act
$result = $tokenizer->tokenize($text);
// Act
$result = $tokenizer->tokenize($text);

// Assert
expect($result)->toBe(['sample', 'text', 'sample', 'text']);
});
// Assert
expect($result)->toBe(['sample', 'text', 'sample', 'text']);
});

it('Tokenizer@tokenize: Seperates the text with regex patterns', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern('\s');
test('it can separate the text from regex patterns', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern('\s');

$text = 'sample text '.PHP_EOL.'sample text';
$text = 'sample text '.PHP_EOL.'sample text';

// Act
$result = $tokenizer->tokenize($text);
// Act
$result = $tokenizer->tokenize($text);

// Assert
expect($result)->toBe(['sample', 'text', 'sample', 'text']);
});
// Assert
expect($result)->toBe(['sample', 'text', 'sample', 'text']);
});

it('Tokenizer@tokenize: A minimum word length can be set', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);
test('A minimum word length can be set', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);

$text = 'A sample text with a some meaningless words';
$text = 'A sample text with some meaningless words';

// Act
$result = $tokenizer->tokenize($text, 6);
// Act
$result = $tokenizer->tokenize(text: $text, minWordLength: 6);

// Assert
expect($result)->toBe(['sample', 'meaningless']);
});
// Assert
expect($result)->toBe(['sample', 'meaningless']);
});

it('Tokenizer@sentences: Separates the text with the given punctuation into sentences', function (): void {
// Act
$tokenizer = new Tokenizer();
$tokenizer
->addSentenceSeparatorPattern('.')
->addSentenceSeparatorPattern('!')
->addSentenceSeparatorPattern('?');

$text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End';
it('it can filter the tokens by given removal rule', function (): void {
// Arrange
$tokenizer = (new Tokenizer())
->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);

// Act
$result = $tokenizer->sentences($text);
// Act I
$tokenizer->addWordFilterRule('/m/');
// Assert I
expect($tokenizer->tokenize('sample text'))->toBe(['saple', 'text']);

// Assert
expect($result)->toBe([
'Sample Sentence.',
'Sample Sentence!',
'Sample Sentence?',
'Sample Sentence no.',
'4?!',
'Sample sample sentence...',
'End',
]);
// Act II
$tokenizer->addWordFilterRule('/x/', 'q');
// Assert II
expect($tokenizer->tokenize('sample text'))->toBe(['saple', 'teqt']);
});
});

it('Tokenizer@tokenizeBySentences: Separates the text into tokens by sentences', function (): void {
// Arrange
$tokenizer = new Tokenizer();
$tokenizer
->addSentenceSeparatorPattern('.')
->addSentenceSeparatorPattern('!')
->addSentenceSeparatorPattern('?')
->addWordFilterRule(TokenizerFilterType::NO_SYMBOLS)
->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);

$text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End';

// Act
$result = $tokenizer->tokenizeBySentences($text);

// Assert
expect($result)->toBe([
['Sample', 'Sentence'],
['Sample', 'Sentence'],
['Sample', 'Sentence'],
['Sample', 'Sentence', 'no'],
['Sample', 'sample', 'sentence'],
['End'],
]);
// Verifies Tokenizer@sentences: splitting raw text into sentence strings
// on the configured punctuation patterns.
describe('Tokenizer@sentences', function () {
    test('it can separate the text with the given punctuation into sentences', function (): void {
        // Arrange: a tokenizer that treats '.', '!' and '?' as sentence terminators
        $tokenizer = new Tokenizer();
        $tokenizer
            ->addSentenceSeparatorPattern('.')
            ->addSentenceSeparatorPattern('!')
            ->addSentenceSeparatorPattern('?');

        $text = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End';

        // Act
        $result = $tokenizer->sentences($text);

        // Assert: consecutive terminators ('?!', '...') stay attached to their
        // sentence, and trailing text without punctuation ('End') is kept as-is
        expect($result)->toBe([
            'Sample Sentence.',
            'Sample Sentence!',
            'Sample Sentence?',
            'Sample Sentence no.',
            '4?!',
            'Sample sample sentence...',
            'End',
        ]);
    });
});

it('Tokenizer@tokenize: Filters the tokens by given removal rule', function (): void {
// Arrange
$tokenizer = (new Tokenizer())
->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);

// Act I
$tokenizer->addWordFilterRule('/m/');
// Assert I
expect($tokenizer->tokenize('sample text'))->toBe(['saple', 'text']);

// Act II
$tokenizer->addWordFilterRule('/x/', 'q');
// Assert II
expect($tokenizer->tokenize('sample text'))->toBe(['saple', 'teqt']);
// Verifies Tokenizer@tokenizeBySentences: the text is first split into
// sentences, then each sentence is split into filtered word tokens.
describe('Tokenizer@tokenizeBySentences', function () {
    test('it can separate the text into tokens by sentences', function (): void {
        // Arrange: sentence terminators plus symbol filtering and
        // whitespace word separation, chained on a fresh instance
        $tokenizer = (new Tokenizer())
            ->addSentenceSeparatorPattern('.')
            ->addSentenceSeparatorPattern('!')
            ->addSentenceSeparatorPattern('?')
            ->addWordFilterRule(TokenizerFilterType::NO_SYMBOLS)
            ->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR);

        $sampleText = 'Sample Sentence. Sample Sentence! Sample Sentence? Sample Sentence no. 4?! Sample sample sentence... End';

        // Act
        $tokensBySentence = $tokenizer->tokenizeBySentences($sampleText);

        // Assert: one inner array per sentence; punctuation and the
        // symbol-only token '4?!' are filtered out entirely
        expect($tokensBySentence)->toBe([
            ['Sample', 'Sentence'],
            ['Sample', 'Sentence'],
            ['Sample', 'Sentence'],
            ['Sample', 'Sentence', 'no'],
            ['Sample', 'sample', 'sentence'],
            ['End'],
        ]);
    });
});

it('Tokenizer: Can convert tokens to lowercase', function (): void {
test('it can convert tokens to lowercase', function (): void {
// Arrange
$tokenizer = (new Tokenizer())
->addWordSeparatorPattern(TokenizerFilterType::WHITESPACE_SEPARATOR)
Expand All @@ -145,7 +151,7 @@
expect($result)->toBe(['sample', 'text']);
});

test('Tokenizer: Can be converted to an array', function (): void {
test('it can convert tokens to an array', function (): void {
// Arrange
$tokenizer = (new Tokenizer())
->addWordFilterRule(TokenizerFilterType::NO_SYMBOLS)
Expand Down

0 comments on commit 2ad391a

Please sign in to comment.