Skip to content

Commit

Permalink
Add Tokenizer.plus_operator option
Browse files Browse the repository at this point in the history
  • Loading branch information
TeamSpen210 committed May 24, 2024
1 parent 1640689 commit 40c662f
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 69 deletions.
3 changes: 3 additions & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ Version (dev)
allow reporting the source location in exceptions.
* :py:meth:`Keyvalues.export() <srctools.keyvalues.Keyvalues.export>` is now deprecated.
* Allow directly passing enums to set VMF keyvalues and fixups, if the ``value`` is itself a valid value.
* Add :py:attr:`~srctools.tokenizer.Tokenizer.plus_operator`, allowing ``+`` to be
parsed as an operator for FGDs but still be valid inside bare strings elsewhere.
These are common in ``gameinfo.txt``.

--------------
Version 2.3.17
Expand Down
53 changes: 29 additions & 24 deletions src/srctools/_tokenizer.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ cdef:

# Characters not allowed for bare names on a line.
# TODO: Make this an actual constant value, but that won't do the switch optimisation.
DEF BARE_DISALLOWED = b'"\'{};,=+[]()\r\n\t '
DEF BARE_DISALLOWED = b'"\'{};,=[]()\r\n\t '

# Pack flags into a bitfield.
cdef extern from *:
Expand All @@ -78,6 +78,7 @@ struct TokFlags {
unsigned char string_brackets: 1;
unsigned char allow_escapes: 1;
unsigned char colon_operator: 1;
unsigned char plus_operator: 1;
unsigned char allow_star_comments: 1;
unsigned char preserve_comments: 1;
unsigned char file_input: 1;
Expand All @@ -88,6 +89,7 @@ struct TokFlags {
bint string_brackets
bint allow_escapes
bint colon_operator
bint plus_operator
bint allow_star_comments
bint preserve_comments
# If set, the file_iter is a bound read() method.
Expand Down Expand Up @@ -419,11 +421,12 @@ cdef class Tokenizer(BaseTokenizer):
object filename=None,
error=None,
*,
bint string_bracket=False,
bint allow_escapes=True,
bint allow_star_comments=False,
bint preserve_comments=False,
bint colon_operator=False,
bint string_bracket: bool = False,
bint allow_escapes: bool = True,
bint allow_star_comments: bool = False,
bint preserve_comments: bool = False,
bint colon_operator: bool = False,
bint plus_operator: bool = False,
):
# Early warning for this particular error.
if isinstance(data, bytes) or isinstance(data, bytearray):
Expand All @@ -438,6 +441,7 @@ cdef class Tokenizer(BaseTokenizer):
'allow_star_comments': allow_star_comments,
'preserve_comments': preserve_comments,
'colon_operator': colon_operator,
'plus_operator': plus_operator,
'file_input': 0,
'last_was_cr': 0,
}
Expand Down Expand Up @@ -492,60 +496,61 @@ cdef class Tokenizer(BaseTokenizer):

@property
def string_bracket(self) -> bool:
"""Check if [bracket] blocks are parsed as a single string-like block.
"""Controls whether [bracket] blocks are parsed as a single string-like block.

If disabled these are parsed as BRACK_OPEN, STRING, BRACK_CLOSE.
"""
return self.flags.string_brackets

@string_bracket.setter
def string_bracket(self, bint value) -> None:
"""Set if [bracket] blocks are parsed as a single string-like block.

If disabled these are parsed as BRACK_OPEN, STRING, BRACK_CLOSE.
"""
self.flags.string_brackets = value

@property
def allow_escapes(self) -> bool:
"""Check if backslash escapes will be parsed."""
"""Controls whether backslash escapes will be parsed."""
return self.flags.allow_escapes

@allow_escapes.setter
def allow_escapes(self, bint value) -> None:
"""Set if backslash escapes will be parsed."""
self.flags.allow_escapes = value

@property
def allow_star_comments(self) -> bool:
"""Check if /**/ style comments will be enabled."""
"""Controls whether /**/ style comments will be enabled."""
return self.flags.allow_star_comments

@allow_star_comments.setter
def allow_star_comments(self, bint value) -> None:
"""Set if /**/ style comments are enabled."""
self.flags.allow_star_comments = value

@property
def preserve_comments(self) -> bool:
"""Check if comments will be output as tokens."""
"""Controls whether comments will be output as tokens."""
return self.flags.preserve_comments

@preserve_comments.setter
def preserve_comments(self, bint value) -> None:
"""Set if comments will be output as tokens."""
self.flags.preserve_comments = value

@property
def colon_operator(self) -> bool:
"""Check if : characters are treated as a COLON token, or part of strings."""
"""Controls whether : characters are treated as a COLON token, or part of strings."""
return self.flags.colon_operator

@colon_operator.setter
def colon_operator(self, bint value) -> None:
"""Set if : characters are treated as a COLON token, or part of strings."""
self.flags.colon_operator = value

@property
def plus_operator(self) -> bool:
    """Controls whether + characters are treated as a PLUS token, or part of strings."""
    return self.flags.plus_operator

@plus_operator.setter
def plus_operator(self, bint value) -> None:
    """Set if + characters are treated as a PLUS token, or part of strings."""
    self.flags.plus_operator = value

cdef inline bint buf_reset(self) except False:
"""Reset the temporary buffer."""
# Don't bother resizing or clearing, the next append will overwrite.
Expand Down Expand Up @@ -669,8 +674,6 @@ cdef class Tokenizer(BaseTokenizer):
return BRACE_OPEN_TUP
elif next_char == b'}':
return BRACE_CLOSE_TUP
elif next_char == b'+':
return PLUS_TUP
elif next_char == b'=':
return EQUALS_TUP
elif next_char == b',':
Expand Down Expand Up @@ -896,6 +899,8 @@ cdef class Tokenizer(BaseTokenizer):
else: # These complex checks can't be in a switch, so we need to nest this.
if next_char == b':' and self.flags.colon_operator:
return COLON_TUP
if next_char == b'+' and self.flags.plus_operator:
return PLUS_TUP
# Bare names
if next_char not in BARE_DISALLOWED:
self.buf_reset()
Expand All @@ -908,12 +913,12 @@ cdef class Tokenizer(BaseTokenizer):
return STRING, self.buf_get_text()

elif (
next_char in BARE_DISALLOWED or
(next_char == b':' and self.flags.colon_operator)
next_char in BARE_DISALLOWED
or (next_char == b':' and self.flags.colon_operator)
or (next_char == b'+' and self.flags.plus_operator)
): # We need to repeat this so we return the ending
# char next. If it's not allowed, that'll error on
# next call.
# We need to repeat this so we return the newline.
self.char_index -= 1
return STRING, self.buf_get_text()
else:
Expand Down
1 change: 1 addition & 0 deletions src/srctools/fgd.py
Original file line number Diff line number Diff line change
Expand Up @@ -2192,6 +2192,7 @@ def parse_file(
error=FGDParseError,
string_bracket=False,
colon_operator=True,
plus_operator=True,
)
for token, token_value in tokeniser:
# The only things at top-level would be bare strings, and empty lines.
Expand Down
21 changes: 16 additions & 5 deletions src/srctools/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,9 @@ class Token(Enum):
BRACK_OPEN = 12 #: A ``[`` character. Only used if ``PROP_FLAG`` is not.
BRACK_CLOSE = 13 #: A ``]`` character.

COLON = 14 #: A ``:`` character.
COLON = 14 #: A ``:`` character, if :py:attr:`~Tokenizer.colon_operator` is enabled.
EQUALS = 15 #: A ``=`` character.
PLUS = 16 #: A ``+`` character.
PLUS = 16 #: A ``+`` character, if :py:attr:`~Tokenizer.plus_operator` is enabled.
COMMA = 17 #: A ``,`` character.

@property
Expand Down Expand Up @@ -149,7 +149,6 @@ def has_value(self) -> bool:
'}': Token.BRACE_CLOSE,

'=': Token.EQUALS,
'+': Token.PLUS,
',': Token.COMMA,
}

Expand All @@ -167,7 +166,7 @@ def has_value(self) -> bool:
}

#: Characters not allowed for bare strings. These must be quoted.
BARE_DISALLOWED: Final = frozenset('"\'{};,=+[]()\r\n\t ')
BARE_DISALLOWED: Final = frozenset('"\'{};,=[]()\r\n\t ')


class BaseTokenizer(abc.ABC):
Expand Down Expand Up @@ -389,6 +388,8 @@ class Tokenizer(BaseTokenizer):
* preserve_comments causes :py:const:`Token.COMMENT` tokens to be produced.
* colon_operator controls if ``:`` produces :py:const:`~Token.COLON` tokens, or is treated as
a bare string.
* plus_operator controls if ``+`` produces :py:const:`~Token.PLUS` tokens, or is treated as
a bare string.
"""
chunk_iter: Iterator[str]
cur_chunk: str
Expand All @@ -398,6 +399,7 @@ class Tokenizer(BaseTokenizer):
allow_star_comments: bool
preserve_comments: bool
colon_operator: bool
plus_operator: bool
_last_was_cr: bool

def __init__(
Expand All @@ -411,6 +413,7 @@ def __init__(
allow_star_comments: bool = False,
preserve_comments: bool = False,
colon_operator: bool = False,
plus_operator: bool = False,
) -> None:
# If a file-like object, automatically use the configured name.
if filename is None and hasattr(data, 'name'):
Expand Down Expand Up @@ -440,6 +443,7 @@ def __init__(
self.allow_escapes = bool(allow_escapes)
self.allow_star_comments = bool(allow_star_comments)
self.colon_operator = bool(colon_operator)
self.plus_operator = bool(plus_operator)
self.preserve_comments = bool(preserve_comments)
self._last_was_cr = False

Expand Down Expand Up @@ -552,6 +556,9 @@ def _get_token(self) -> Tuple[Token, str]:
elif next_char == ':' and self.colon_operator:
return Token.COLON, ':'

elif next_char == '+' and self.plus_operator:
return Token.PLUS, '+'

elif next_char == ']':
if self.string_bracket:
# If string_bracket is set (using PROP_FLAG), this is a
Expand Down Expand Up @@ -582,7 +589,11 @@ def _get_token(self) -> Tuple[Token, str]:
value_chars = [next_char]
while True:
next_char = self._next_char()
if next_char in BARE_DISALLOWED or (next_char == ':' and self.colon_operator):
if (
next_char in BARE_DISALLOWED
or (next_char == ':' and self.colon_operator)
or (next_char == '+' and self.plus_operator)
):
# We need to repeat this, so we return the ending char next.
# If it's not allowed, that'll error on next call.
self.char_index -= 1
Expand Down
24 changes: 12 additions & 12 deletions tests/test_keyvalues.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def test_names() -> None:
"Extra" "Spaces"
// "Commented" "out"
"Block" {
"Empty"
Empty
{
} }
"Block" // "with value"
Expand All @@ -133,7 +133,7 @@ def test_names() -> None:
"Root2"
{
"Name with \\" in it" "Value with \\" inside"
"multiline" "text
multi+line "text
\tcan continue
for many \\"lines\\" of
possibly indented
Expand All @@ -151,8 +151,8 @@ def test_names() -> None:
"Flag" "blocksthis" [!test_enabled]
"Replaced" "shouldbe"
"Replaced" "toreplace" [test_enabled]
"Replaced" "alsothis" [test_enabled]
"Replaced" to+replace [test_enabled]
"Replaced" also+this [test_enabled]
"Replaced" "shouldbe2"
"Replaced" "toreplace2" [!test_disabled]
Expand All @@ -165,8 +165,8 @@ def test_names() -> None:
}
"Replaced" [test_enabled]
{
"lambda" "should"
"replace" "above"
lambda should
replace above
}
"Replaced"
Expand Down Expand Up @@ -208,7 +208,7 @@ def test_names() -> None:
]),
P('Root2', [
P('Name with " in it', 'Value with \" inside'),
P('multiline',
P('multi+line',
'text\n\tcan continue\nfor many "lines" of\n possibly indented\n\ntext'
),
# Note, invalid = unchanged.
Expand All @@ -219,8 +219,8 @@ def test_names() -> None:
P('after ', 'value'),
P('Flag', 'allowed'),
P('FlagAllows', 'This'),
P('Replaced', 'toreplace'),
P('Replaced', 'alsothis'),
P('Replaced', 'to+replace'),
P('Replaced', 'also+this'),
P('Replaced', 'toreplace2'),
P('Replaced', 'alsothis2'),
P('Replaced', [
Expand Down Expand Up @@ -284,7 +284,7 @@ def test_build() -> None:
b.block('he\tre')
with b.Root2:
b['Name with " in it']('Value with \" inside')
b.multiline(
b['multi+line'](
'text\n\tcan continue\nfor many "lines" of\n possibly '
'indented\n\ntext'
)
Expand All @@ -297,8 +297,8 @@ def test_build() -> None:
b['after ']('value')
b.Flag('allowed')
b.FlagAllows('This')
b.Replaced('toreplace')
b.Replaced('alsothis')
b.Replaced('to+replace')
b.Replaced('also+this')
b.Replaced('toreplace2')
b.Replaced('alsothis2')
with b.Replaced:
Expand Down
Loading

0 comments on commit 40c662f

Please sign in to comment.