Skip to content

Commit

Permalink
Simplify Text_Parser by removing unused methods
Browse files Browse the repository at this point in the history
  • Loading branch information
mundschenk-at committed Mar 17, 2024
1 parent aade28f commit f84c3e9
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 255 deletions.
66 changes: 14 additions & 52 deletions src/class-text-parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,11 @@
use PHP_Typography\Text_Parser\Token;

/**
* A class to parse plain text (such as the data of DOMText).
* A class to parse plain text (such as the data of DOMText). If multibyte characters are passed,
* they must be encoded as UTF-8.
*
* Text_Parser assumes no HTML markup in the text (except for special html characters like >).
* If multibyte characters are passed, they must be encoded as UTF-8.
* @since 7.0.0 The `load`, `reload`, and `clear` methods have been removed in favor of creating new parser
* objects for each text fragment. The method `unload` has been replaced with `get_text`.
*/
class Text_Parser {

Expand Down Expand Up @@ -261,41 +262,26 @@ class Text_Parser {
*
* @var callable
*/
private $current_strtoupper = 'strtoupper';
private $current_strtoupper;

/**
* The tokenized text.
*
* @var Token[] $text Numerically indexed tokens.
*/
private $text = [];
private array $text = [];

/**
* Creates a new parser object.
*/
public function __construct() {
}

/**
* Tokenizes a string and stores the tokens in $this->text.
*
* @param string $raw_text A text fragment without any HTML markup.
* Creates a new parser object and parses the given text.
*
* @return bool Returns `true` on successful completion, `false` otherwise.
* @param string $text A text fragment without any HTML markup.
*/
public function load( string $raw_text ) {
if ( empty( $raw_text ) ) {
return false; // Can't tokenize an empty string.
}

public function __construct( string $text ) {
// Detect encoding.
$this->current_strtoupper = Strings::functions( $raw_text )['strtoupper'];
$this->current_strtoupper = Strings::functions( $text )['strtoupper'];

// Tokenize the raw text parts.
$this->text = self::tokenize( \preg_split( self::RE_ANY_TEXT, $raw_text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ) ?: [] ); // phpcs:ignore Universal.Operators.DisallowShortTernary -- Ensure array type in case of error.

// The token array should never be empty.
return ! empty( $this->text );
$this->text = self::tokenize( \preg_split( self::RE_ANY_TEXT, $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ) ?: [] ); // phpcs:ignore Universal.Operators.DisallowShortTernary -- Ensure array type in case of error.
}

/**
Expand Down Expand Up @@ -387,42 +373,23 @@ protected static function is_not_preceeded_by( $type, array $tokens, $index, $st
return $index - $steps >= 0 && $type !== $tokens[ $index - $steps ]->type;
}


/**
* Reloads $this->text (i.e. capture new inserted text, or remove those tokens whose values have been deleted).
* Returns the complete text as a string.
*
* Warning: Tokens previously acquired through 'get' methods may not match new tokenization.
*
* @return bool Returns true on successful completion.
*/
public function reload() {
return $this->load( $this->unload() );
}

/**
* Returns the complete text as a string and clears the parser.
* @since 7.0.0
*
* @return string
*/
public function unload() {
public function get_text(): string {
$reassembled_text = '';

foreach ( $this->text as $token ) {
$reassembled_text .= $token->value;
}

$this->clear();

return $reassembled_text;
}

/**
* Clears the currently set text from the parser.
*/
public function clear(): void {
$this->text = [];
}

/**
* Updates the 'value' field for all matching tokens.
*
Expand Down Expand Up @@ -471,11 +438,6 @@ public function get_punctuation() {
* @return Token[] An array of numerically indexed tokens.
*/
public function get_words( $abc = self::ALLOW_ALL_LETTERS, $caps = self::ALLOW_ALL_CAPS, $comps = self::ALLOW_COMPOUNDS ) {
// Return early if no text has been loaded.
if ( empty( $this->text ) ) {
return []; // abort.
}

// Result set.
$tokens = [];

Expand Down
30 changes: 9 additions & 21 deletions src/fixes/node-fixes/class-process-words-fix.php
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,6 @@ class Process_Words_Fix extends Abstract_Node_Fix {
*/
private $token_fixes = [];

/**
* A custom parser for \DOMText to separate words, whitespace etc. for HTML injection.
*
* @var Text_Parser|null
*/
private $text_parser;

/**
* Apply the fix to a given textnode.
*
Expand All @@ -74,18 +67,14 @@ class Process_Words_Fix extends Abstract_Node_Fix {
* @return void
*/
public function apply( \DOMText $textnode, Settings $settings, $is_title ) {
// Lazy-load text parser.
$text_parser = $this->get_text_parser();
$tokens = [];

// Set up parameters for word categories.
$mixed_caps = empty( $settings[ Settings::HYPHENATE_ALL_CAPS ] ) ? Text_Parser::ALLOW_ALL_CAPS : Text_Parser::NO_ALL_CAPS;
$letter_caps = empty( $settings[ Settings::HYPHENATE_ALL_CAPS ] ) ? Text_Parser::NO_ALL_CAPS : Text_Parser::ALLOW_ALL_CAPS;
$mixed_compounds = empty( $settings[ Settings::HYPHENATE_COMPOUNDS ] ) ? Text_Parser::ALLOW_COMPOUNDS : Text_Parser::NO_COMPOUNDS;
$letter_compounds = empty( $settings[ Settings::HYPHENATE_COMPOUNDS ] ) ? Text_Parser::NO_COMPOUNDS : Text_Parser::ALLOW_COMPOUNDS;

// Break text down for a bit more granularity.
$text_parser->load( $textnode->data );
$text_parser = $this->get_text_parser( $textnode->data );
$tokens[ Token_Fix::MIXED_WORDS ] = $text_parser->get_words( Text_Parser::NO_ALL_LETTERS, $mixed_caps, $mixed_compounds ); // prohibit letter-only words, allow caps, allow compounds (or not).
$tokens[ Token_Fix::COMPOUND_WORDS ] = ! empty( $settings[ Settings::HYPHENATE_COMPOUNDS ] ) ? $text_parser->get_words( Text_Parser::NO_ALL_LETTERS, $letter_caps, Text_Parser::REQUIRE_COMPOUNDS ) : [];
$tokens[ Token_Fix::WORDS ] = $text_parser->get_words( Text_Parser::REQUIRE_ALL_LETTERS, $letter_caps, $letter_compounds ); // require letter-only words allow/prohibit caps & compounds vice-versa.
Expand All @@ -100,21 +89,20 @@ public function apply( \DOMText $textnode, Settings $settings, $is_title ) {

// Apply updates to our text.
$text_parser->update( $tokens[ Token_Fix::MIXED_WORDS ] + $tokens[ Token_Fix::COMPOUND_WORDS ] + $tokens[ Token_Fix::WORDS ] + $tokens[ Token_Fix::OTHER ] );
$textnode->data = $text_parser->unload();
$textnode->data = $text_parser->get_text();
}

/**
* Retrieves the text parser instance.
*
* @return \PHP_Typography\Text_Parser
* @since 7.0.0 Parameter $text added.
*
* @param string $text The text to tokenize.
*
* @return Text_Parser
*/
public function get_text_parser() {
// Lazy-load text parser.
if ( ! isset( $this->text_parser ) ) {
$this->text_parser = new Text_Parser();
}

return $this->text_parser;
public function get_text_parser( string $text ) {
return new Text_Parser( $text );
}

/**
Expand Down
Loading

0 comments on commit f84c3e9

Please sign in to comment.