Simplify Text_Parser by removing unused methods

mundschenk-at · Mar 17, 2024 · f84c3e9 · f84c3e9
1 parent aade28f
commit f84c3e9
Show file tree

Hide file tree

Showing 4 changed files with 73 additions and 255 deletions.
diff --git a/src/class-text-parser.php b/src/class-text-parser.php
@@ -31,10 +31,11 @@
 use PHP_Typography\Text_Parser\Token;
 
 /**
- * A class to parse plain text (such as the data of DOMText).
+ * A class to parse plain text (such as the data of DOMText). If multibyte characters are passed,
+ * they must be encoded as UTF-8.
  *
- * Parse_Text assumes no HTML markup in the text (except for special html characters like &gt;).
- * If multibyte characters are passed, they must be encoded as UTF-8.
+ * @since 7.0.0 The `load`, `reload`, and `clear` methods have been removed in favor of creating new parser
+ *              objects for each text fragment. The method `unload` has been replaced with `get_text`.
  */
 class Text_Parser {
 
@@ -261,41 +262,26 @@ class Text_Parser {
 	 *
 	 * @var callable
 	 */
-	private $current_strtoupper = 'strtoupper';
+	private $current_strtoupper;
 
 	/**
 	 * The tokenized text.
 	 *
 	 * @var Token[] $text Numerically indexed tokens.
 	 */
-	private $text = [];
+	private array $text = [];
 
 	/**
-	 * Creates a new parser object.
-	 */
-	public function __construct() {
-	}
-
-	/**
-	 * Tokenizes a string and stores the tokens in $this->text.
-	 *
-	 * @param string $raw_text A text fragment without any HTML markup.
+	 * Creates a new parser object and parses the given text.
 	 *
-	 * @return bool Returns `true` on successful completion, `false` otherwise.
+	 * @param string $text A text fragment without any HTML markup.
 	 */
-	public function load( string $raw_text ) {
-		if ( empty( $raw_text ) ) {
-			return false; // Can't tokenize an empty string.
-		}
-
+	public function __construct( string $text ) {
 		// Detect encoding.
-		$this->current_strtoupper = Strings::functions( $raw_text )['strtoupper'];
+		$this->current_strtoupper = Strings::functions( $text )['strtoupper'];
 
 		// Tokenize the raw text parts.
-		$this->text = self::tokenize( \preg_split( self::RE_ANY_TEXT, $raw_text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ) ?: [] ); // phpcs:ignore Universal.Operators.DisallowShortTernary -- Ensure array type in case of error.
-
-		// The token array should never be empty.
-		return ! empty( $this->text );
+		$this->text = self::tokenize( \preg_split( self::RE_ANY_TEXT, $text, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY ) ?: [] ); // phpcs:ignore Universal.Operators.DisallowShortTernary -- Ensure array type in case of error.
 	}
 
 	/**
@@ -387,42 +373,23 @@ protected static function is_not_preceeded_by( $type, array $tokens, $index, $st
 		return $index - $steps >= 0 && $type !== $tokens[ $index - $steps ]->type;
 	}
 
-
 	/**
-	 * Reloads $this->text (i.e. capture new inserted text, or remove those tokens whose values have been deleted).
+	 * Returns the complete text as a string.
 	 *
-	 * Warning: Tokens previously acquired through 'get' methods may not match new tokenization.
-	 *
-	 * @return bool Returns true on successful completion.
-	 */
-	public function reload() {
-		return $this->load( $this->unload() );
-	}
-
-	/**
-	 * Returns the complete text as a string and clears the parser.
+	 * @since 7.0.0
 	 *
 	 * @return string
 	 */
-	public function unload() {
+	public function get_text(): string {
 		$reassembled_text = '';
 
 		foreach ( $this->text as $token ) {
 			$reassembled_text .= $token->value;
 		}
 
-		$this->clear();
-
 		return $reassembled_text;
 	}
 
-	/**
-	 * Clears the currently set text from the parser.
-	 */
-	public function clear(): void {
-		$this->text = [];
-	}
-
 	/**
 	 * Updates the 'value' field for all matching tokens.
 	 *
@@ -471,11 +438,6 @@ public function get_punctuation() {
 	 * @return Token[] An array of numerically indexed tokens.
 	 */
 	public function get_words( $abc = self::ALLOW_ALL_LETTERS, $caps = self::ALLOW_ALL_CAPS, $comps = self::ALLOW_COMPOUNDS ) {
-		// Return early if no text has been loaded.
-		if ( empty( $this->text ) ) {
-			return []; // abort.
-		}
-
 		// Result set.
 		$tokens = [];
 

diff --git a/src/fixes/node-fixes/class-process-words-fix.php b/src/fixes/node-fixes/class-process-words-fix.php
@@ -55,13 +55,6 @@ class Process_Words_Fix extends Abstract_Node_Fix {
 	 */
 	private $token_fixes = [];
 
-	/**
-	 * A custom parser for \DOMText to separate words, whitespace etc. for HTML injection.
-	 *
-	 * @var Text_Parser|null
-	 */
-	private $text_parser;
-
 	/**
 	 * Apply the fix to a given textnode.
 	 *
@@ -74,18 +67,14 @@ class Process_Words_Fix extends Abstract_Node_Fix {
 	 * @return void
 	 */
 	public function apply( \DOMText $textnode, Settings $settings, $is_title ) {
-		// Lazy-load text parser.
-		$text_parser = $this->get_text_parser();
-		$tokens      = [];
-
 		// Set up parameters for word categories.
 		$mixed_caps       = empty( $settings[ Settings::HYPHENATE_ALL_CAPS ] ) ? Text_Parser::ALLOW_ALL_CAPS : Text_Parser::NO_ALL_CAPS;
 		$letter_caps      = empty( $settings[ Settings::HYPHENATE_ALL_CAPS ] ) ? Text_Parser::NO_ALL_CAPS : Text_Parser::ALLOW_ALL_CAPS;
 		$mixed_compounds  = empty( $settings[ Settings::HYPHENATE_COMPOUNDS ] ) ? Text_Parser::ALLOW_COMPOUNDS : Text_Parser::NO_COMPOUNDS;
 		$letter_compounds = empty( $settings[ Settings::HYPHENATE_COMPOUNDS ] ) ? Text_Parser::NO_COMPOUNDS : Text_Parser::ALLOW_COMPOUNDS;
 
 		// Break text down for a bit more granularity.
-		$text_parser->load( $textnode->data );
+		$text_parser                         = $this->get_text_parser( $textnode->data );
 		$tokens[ Token_Fix::MIXED_WORDS ]    = $text_parser->get_words( Text_Parser::NO_ALL_LETTERS, $mixed_caps, $mixed_compounds );  // prohibit letter-only words, allow caps, allow compounds (or not).
 		$tokens[ Token_Fix::COMPOUND_WORDS ] = ! empty( $settings[ Settings::HYPHENATE_COMPOUNDS ] ) ? $text_parser->get_words( Text_Parser::NO_ALL_LETTERS, $letter_caps, Text_Parser::REQUIRE_COMPOUNDS ) : [];
 		$tokens[ Token_Fix::WORDS ]          = $text_parser->get_words( Text_Parser::REQUIRE_ALL_LETTERS, $letter_caps, $letter_compounds ); // require letter-only words allow/prohibit caps & compounds vice-versa.
@@ -100,21 +89,20 @@ public function apply( \DOMText $textnode, Settings $settings, $is_title ) {
 
 		// Apply updates to our text.
 		$text_parser->update( $tokens[ Token_Fix::MIXED_WORDS ] + $tokens[ Token_Fix::COMPOUND_WORDS ] + $tokens[ Token_Fix::WORDS ] + $tokens[ Token_Fix::OTHER ] );
-		$textnode->data = $text_parser->unload();
+		$textnode->data = $text_parser->get_text();
 	}
 
 	/**
 	 * Retrieves the text parser instance.
 	 *
-	 * @return \PHP_Typography\Text_Parser
+	 * @since 7.0.0 Parameter $text added.
+	 *
+	 * @param string $text The text to tokenize.
+	 *
+	 * @return Text_Parser
 	 */
-	public function get_text_parser() {
-		// Lazy-load text parser.
-		if ( ! isset( $this->text_parser ) ) {
-			$this->text_parser = new Text_Parser();
-		}
-
-		return $this->text_parser;
+	public function get_text_parser( string $text ) {
+		return new Text_Parser( $text );
 	}
 
 	/**