diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
index 4ed9cbe1..3cd3286d 100644
--- a/docs/source/changelog.rst
+++ b/docs/source/changelog.rst
@@ -16,6 +16,9 @@ Version (dev)
   allow reporting the source location in exceptions.
 * :py:meth:`Keyvalues.export() <srctools.keyvalues.Keyvalues.export>` is now deprecated.
 * Allow directly passing enums to set VMF keyvalues and fixups, if the ``value`` is itself a valid value.
+* Add :py:attr:`~srctools.tokenizer.Tokenizer.plus_operator`, allowing ``+`` to be
+  parsed as an operator for FGDs, while still being valid inside bare strings elsewhere.
+  Such strings are common in ``gameinfo.txt``.
 
 --------------
 Version 2.3.17
diff --git a/src/srctools/_tokenizer.pyx b/src/srctools/_tokenizer.pyx
index 2883682c..303a0f76 100644
--- a/src/srctools/_tokenizer.pyx
+++ b/src/srctools/_tokenizer.pyx
@@ -69,7 +69,7 @@ cdef:
 
 # Characters not allowed for bare names on a line.
 # TODO: Make this an actual constant value, but that won't do the switch optimisation.
-DEF BARE_DISALLOWED = b'"\'{};,=+[]()\r\n\t '
+DEF BARE_DISALLOWED = b'"\'{};,=[]()\r\n\t '
 
 # Pack flags into a bitfield.
 cdef extern from *:
@@ -78,6 +78,7 @@ struct TokFlags {
     unsigned char string_brackets: 1;
     unsigned char allow_escapes: 1;
     unsigned char colon_operator: 1;
+    unsigned char plus_operator: 1;
     unsigned char allow_star_comments: 1;
     unsigned char preserve_comments: 1;
     unsigned char file_input: 1;
@@ -88,6 +89,7 @@ struct TokFlags {
     bint string_brackets
     bint allow_escapes
     bint colon_operator
+    bint plus_operator
     bint allow_star_comments
     bint preserve_comments
     # If set, the file_iter is a bound read() method.
@@ -419,11 +421,12 @@ cdef class Tokenizer(BaseTokenizer):
         object filename=None,
         error=None,
         *,
-        bint string_bracket=False,
-        bint allow_escapes=True,
-        bint allow_star_comments=False,
-        bint preserve_comments=False,
-        bint colon_operator=False,
+        bint string_bracket: bool = False,
+        bint allow_escapes: bool = True,
+        bint allow_star_comments: bool = False,
+        bint preserve_comments: bool = False,
+        bint colon_operator: bool = False,
+        bint plus_operator: bool = False,
     ):
         # Early warning for this particular error.
         if isinstance(data, bytes) or isinstance(data, bytearray):
@@ -438,6 +441,7 @@ cdef class Tokenizer(BaseTokenizer):
             'allow_star_comments': allow_star_comments,
             'preserve_comments': preserve_comments,
             'colon_operator': colon_operator,
+            'plus_operator': plus_operator,
             'file_input': 0,
             'last_was_cr': 0,
         }
@@ -492,7 +496,7 @@ cdef class Tokenizer(BaseTokenizer):
 
     @property
     def string_bracket(self) -> bool:
-        """Check if [bracket] blocks are parsed as a single string-like block.
+        """Controls whether [bracket] blocks are parsed as a single string-like block.
 
         If disabled these are parsed as BRACK_OPEN, STRING, BRACK_CLOSE.
         """
@@ -500,52 +504,53 @@ cdef class Tokenizer(BaseTokenizer):
 
     @string_bracket.setter
     def string_bracket(self, bint value) -> None:
-        """Set if [bracket] blocks are parsed as a single string-like block.
-
-        If disabled these are parsed as BRACK_OPEN, STRING, BRACK_CLOSE.
- """ self.flags.string_brackets = value @property def allow_escapes(self) -> bool: - """Check if backslash escapes will be parsed.""" + """Controls whether backslash escapes will be parsed.""" return self.flags.allow_escapes @allow_escapes.setter def allow_escapes(self, bint value) -> None: - """Set if backslash escapes will be parsed.""" self.flags.allow_escapes = value @property def allow_star_comments(self) -> bool: - """Check if /**/ style comments will be enabled.""" + """Controls whether /**/ style comments will be enabled.""" return self.flags.allow_star_comments @allow_star_comments.setter def allow_star_comments(self, bint value) -> None: - """Set if /**/ style comments are enabled.""" self.flags.allow_star_comments = value @property def preserve_comments(self) -> bool: - """Check if comments will be output as tokens.""" + """Controls whether comments will be output as tokens.""" return self.flags.preserve_comments @preserve_comments.setter def preserve_comments(self, bint value) -> None: - """Set if comments will be output as tokens.""" self.flags.preserve_comments = value @property def colon_operator(self) -> bool: - """Check if : characters are treated as a COLON token, or part of strings.""" + """Controls whether : characters are treated as a COLON token, or part of strings.""" return self.flags.colon_operator @colon_operator.setter def colon_operator(self, bint value) -> None: - """Set if : characters are treated as a COLON token, or part of strings.""" self.flags.colon_operator = value + @property + def plus_operator(self) -> bool: + """Controls whether + characters are treated as a PLUS token, or part of strings.""" + return self.flags.plus_operator + + @plus_operator.setter + def plus_operator(self, bint value) -> None: + self.flags.plus_operator = value + cdef inline bint buf_reset(self) except False: """Reset the temporary buffer.""" # Don't bother resizing or clearing, the next append will overwrite. @@ -669,8 +674,6 @@ cdef class Tokenizer(BaseTokenizer): return BRACE_OPEN_TUP elif next_char == b'}': return BRACE_CLOSE_TUP - elif next_char == b'+': - return PLUS_TUP elif next_char == b'=': return EQUALS_TUP elif next_char == b',': @@ -896,6 +899,8 @@ cdef class Tokenizer(BaseTokenizer): else: # These complex checks can't be in a switch, so we need to nest this. if next_char == b':' and self.flags.colon_operator: return COLON_TUP + if next_char == b'+' and self.flags.plus_operator: + return PLUS_TUP # Bare names if next_char not in BARE_DISALLOWED: self.buf_reset() @@ -908,12 +913,12 @@ cdef class Tokenizer(BaseTokenizer): return STRING, self.buf_get_text() elif ( - next_char in BARE_DISALLOWED or - (next_char == b':' and self.flags.colon_operator) + next_char in BARE_DISALLOWED + or (next_char == b':' and self.flags.colon_operator) + or (next_char == b'+' and self.flags.plus_operator) ): # We need to repeat this so we return the ending # char next. If it's not allowed, that'll error on # next call. - # We need to repeat this so we return the newline. self.char_index -= 1 return STRING, self.buf_get_text() else: diff --git a/src/srctools/fgd.py b/src/srctools/fgd.py index cb6cd598..9fb690f0 100644 --- a/src/srctools/fgd.py +++ b/src/srctools/fgd.py @@ -2192,6 +2192,7 @@ def parse_file( error=FGDParseError, string_bracket=False, colon_operator=True, + plus_operator=True, ) for token, token_value in tokeniser: # The only things at top-level would be bare strings, and empty lines. 
diff --git a/src/srctools/tokenizer.py b/src/srctools/tokenizer.py
index 887a7f53..7273ab8d 100644
--- a/src/srctools/tokenizer.py
+++ b/src/srctools/tokenizer.py
@@ -116,9 +116,9 @@ class Token(Enum):
     BRACK_OPEN = 12  #: A ``[`` character. Only used if ``PROP_FLAG`` is not.
     BRACK_CLOSE = 13  #: A ``]`` character.
 
-    COLON = 14  #: A ``:`` character.
+    COLON = 14  #: A ``:`` character, if :py:attr:`~Tokenizer.colon_operator` is enabled.
     EQUALS = 15  #: A ``=`` character.
-    PLUS = 16  #: A ``+`` character.
+    PLUS = 16  #: A ``+`` character, if :py:attr:`~Tokenizer.plus_operator` is enabled.
     COMMA = 17  #: A ``,`` character.
 
     @property
@@ -149,7 +149,6 @@ def has_value(self) -> bool:
     '}': Token.BRACE_CLOSE,
 
     '=': Token.EQUALS,
-    '+': Token.PLUS,
     ',': Token.COMMA,
 }
 
@@ -167,7 +166,7 @@ def has_value(self) -> bool:
 }
 
 #: Characters not allowed for bare strings. These must be quoted.
-BARE_DISALLOWED: Final = frozenset('"\'{};,=+[]()\r\n\t ')
+BARE_DISALLOWED: Final = frozenset('"\'{};,=[]()\r\n\t ')
 
 
 class BaseTokenizer(abc.ABC):
@@ -389,6 +388,8 @@ class Tokenizer(BaseTokenizer):
     * preserve_comments causes :py:const:`Token.COMMENT` tokens to be produced.
     * colon_operator controls if ``:`` produces :py:const:`~Token.COLON` tokens, or is
       treated as a bare string.
+    * plus_operator controls if ``+`` produces :py:const:`~Token.PLUS` tokens, or is treated as
+      a bare string.
     """
     chunk_iter: Iterator[str]
     cur_chunk: str
@@ -398,6 +399,7 @@ class Tokenizer(BaseTokenizer):
     allow_star_comments: bool
     preserve_comments: bool
     colon_operator: bool
+    plus_operator: bool
    _last_was_cr: bool
 
     def __init__(
@@ -411,6 +413,7 @@ def __init__(
         allow_star_comments: bool = False,
         preserve_comments: bool = False,
         colon_operator: bool = False,
+        plus_operator: bool = False,
     ) -> None:
         # If a file-like object, automatically use the configured name.
         if filename is None and hasattr(data, 'name'):
@@ -440,6 +443,7 @@ def __init__(
         self.allow_escapes = bool(allow_escapes)
         self.allow_star_comments = bool(allow_star_comments)
         self.colon_operator = bool(colon_operator)
+        self.plus_operator = bool(plus_operator)
         self.preserve_comments = bool(preserve_comments)
         self._last_was_cr = False
 
@@ -552,6 +556,9 @@ def _get_token(self) -> Tuple[Token, str]:
         elif next_char == ':' and self.colon_operator:
             return Token.COLON, ':'
 
+        elif next_char == '+' and self.plus_operator:
+            return Token.PLUS, '+'
+
         elif next_char == ']':
             if self.string_bracket:
                 # If string_bracket is set (using PROP_FLAG), this is a
@@ -582,7 +589,11 @@ def _get_token(self) -> Tuple[Token, str]:
             value_chars = [next_char]
             while True:
                 next_char = self._next_char()
-                if next_char in BARE_DISALLOWED or (next_char == ':' and self.colon_operator):
+                if (
+                    next_char in BARE_DISALLOWED
+                    or (next_char == ':' and self.colon_operator)
+                    or (next_char == '+' and self.plus_operator)
+                ):
                     # We need to repeat this, so we return the ending char next.
                     # If it's not allowed, that'll error on next call.
                     self.char_index -= 1
diff --git a/tests/test_keyvalues.py b/tests/test_keyvalues.py
index cbb5bd94..23117359 100644
--- a/tests/test_keyvalues.py
+++ b/tests/test_keyvalues.py
@@ -120,7 +120,7 @@ def test_names() -> None:
     "Extra"    "Spaces"
 // "Commented" "out"
 "Block" {
-    "Empty"
+    Empty
     {
     }
 }
 "Block" // "with value"
@@ -133,7 +133,7 @@ def test_names() -> None:
 "Root2"
 {
     "Name with \\" in it" "Value with \\" inside"
-    "multiline" "text
+    multi+line "text
 \tcan continue
 for many \\"lines\\" of
  possibly indented
@@ -151,8 +151,8 @@ def test_names() -> None:
     "Flag" "blocksthis" [!test_enabled]
 
     "Replaced" "shouldbe"
-    "Replaced" "toreplace" [test_enabled]
-    "Replaced" "alsothis" [test_enabled]
+    "Replaced" to+replace [test_enabled]
+    "Replaced" also+this [test_enabled]
 
     "Replaced" "shouldbe2"
     "Replaced" "toreplace2" [!test_disabled]
@@ -165,8 +165,8 @@ def test_names() -> None:
     }
     "Replaced" [test_enabled]
     {
-    "lambda" "should"
-    "replace" "above"
+    lambda should
+    replace above
     }
 
     "Replaced"
@@ -208,7 +208,7 @@ def test_names() -> None:
         ]),
         P('Root2', [
             P('Name with " in it', 'Value with \" inside'),
-            P('multiline',
+            P('multi+line',
               'text\n\tcan continue\nfor many "lines" of\n possibly indented\n\ntext'
             ),
             # Note, invalid = unchanged.
@@ -219,8 +219,8 @@ def test_names() -> None:
             P('after ', 'value'),
             P('Flag', 'allowed'),
             P('FlagAllows', 'This'),
-            P('Replaced', 'toreplace'),
-            P('Replaced', 'alsothis'),
+            P('Replaced', 'to+replace'),
+            P('Replaced', 'also+this'),
             P('Replaced', 'toreplace2'),
             P('Replaced', 'alsothis2'),
             P('Replaced', [
@@ -284,7 +284,7 @@ def test_build() -> None:
             b.block('he\tre')
     with b.Root2:
         b['Name with " in it']('Value with \" inside')
-        b.multiline(
+        b['multi+line'](
             'text\n\tcan continue\nfor many "lines" of\n possibly '
             'indented\n\ntext'
         )
@@ -297,8 +297,8 @@ def test_build() -> None:
         b['after ']('value')
         b.Flag('allowed')
         b.FlagAllows('This')
-        b.Replaced('toreplace')
-        b.Replaced('alsothis')
+        b.Replaced('to+replace')
+        b.Replaced('also+this')
         b.Replaced('toreplace2')
         b.Replaced('alsothis2')
         with b.Replaced:
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index ce6edb8c..5ca597d0 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -48,7 +48,7 @@
     (T.STRING, "Root2"), T.NEWLINE,
     T.BRACE_OPEN, T.NEWLINE,
     (T.STRING, "Name with \" in it"), (T.STRING, "Value with \" inside"), T.NEWLINE,
-    (T.STRING, "multiline"), (T.STRING, 'text\n\tcan continue\nfor many "lines" of\n possibly indented\n\ntext'), T.NEWLINE,
+    (T.STRING, "multi+line"), (T.STRING, 'text\n\tcan continue\nfor many "lines" of\n possibly indented\n\ntext'), T.NEWLINE,
     (T.STRING, "Escapes"), (T.STRING, '\t \n \\d'), T.NEWLINE,
     (T.STRING, "Oneliner"), T.BRACE_OPEN, (T.STRING, 'name'), (T.STRING, 'value'), T.BRACE_CLOSE, T.NEWLINE,
     T.BRACE_CLOSE, T.NEWLINE,
@@ -61,8 +61,8 @@
     (T.STRING, "Flag"), (T.STRING, "blocksthis"), (T.PROP_FLAG, "!test_enabled"), T.NEWLINE,
     T.NEWLINE,
     (T.STRING, "Replaced"), (T.STRING, "shouldbe"), T.NEWLINE,
-    (T.STRING, "Replaced"), (T.STRING, "toreplace"), (T.PROP_FLAG, "test_enabled"), T.NEWLINE,
-    (T.STRING, "Replaced"), (T.STRING, "alsothis"), (T.PROP_FLAG, "test_enabled"), T.NEWLINE,
+    (T.STRING, "Replaced"), (T.STRING, "to+replace"), (T.PROP_FLAG, "test_enabled"), T.NEWLINE,
+    (T.STRING, "Replaced"), (T.STRING, "also+this"), (T.PROP_FLAG, "test_enabled"), T.NEWLINE,
     T.NEWLINE,
     (T.STRING, "Replaced"), (T.STRING, "shouldbe2"), T.NEWLINE,
     (T.STRING, "Replaced"), (T.STRING, "toreplace2"), (T.PROP_FLAG, "!test_disabled"), T.NEWLINE,
@@ -108,7 +108,7 @@
 #fimport test
 #EXclßÀde value\r
 #caseA\u0345\u03a3test
-{ ]]{ }}}[[ {{] + = "test" + "ing" == vaLUE= 4+6
+{ ]]{ }}}[[ {{] = "test" "ing" == vaLUE= 4 6
 """
 
 noprop_parse_tokens = [
@@ -118,9 +118,9 @@
     (T.DIRECTIVE, "exclssàde"), (T.STRING, "value"), T.NEWLINE,
     (T.DIRECTIVE, "casea\u03b9\u03c3test"), T.NEWLINE,
     T.BRACE_OPEN, T.BRACK_CLOSE, T.BRACK_CLOSE, T.BRACE_OPEN, T.BRACE_CLOSE, T.BRACE_CLOSE, T.BRACE_CLOSE,
-    T.BRACK_OPEN, T.BRACK_OPEN, T.BRACE_OPEN, T.BRACE_OPEN, T.BRACK_CLOSE, T.PLUS,
-    T.EQUALS, (T.STRING, "test"), T.PLUS, (T.STRING, "ing"),
-    T.EQUALS, T.EQUALS, (T.STRING, "vaLUE"), T.EQUALS, (T.STRING, "4"), T.PLUS, (T.STRING, "6"), T.NEWLINE
+    T.BRACK_OPEN, T.BRACK_OPEN, T.BRACE_OPEN, T.BRACE_OPEN, T.BRACK_CLOSE,
+    T.EQUALS, (T.STRING, "test"), (T.STRING, "ing"),
+    T.EQUALS, T.EQUALS, (T.STRING, "vaLUE"), T.EQUALS, (T.STRING, "4"), (T.STRING, "6"), T.NEWLINE
 ]
 
 
@@ -281,7 +281,7 @@ def test_pushback_opvalues(py_c_token: Type[Tokenizer], token: Token, val: str)
 
 def test_call_next(py_c_token: Type[Tokenizer]) -> None:
     """Test that tok() functions, and it can be mixed with iteration."""
-    tok: Tokenizer = py_c_token('''{ "test" } "test" { + } ''', 'file')
+    tok: Tokenizer = py_c_token('''{ "test" } "test" { = } ''', 'file')
 
     tok_type, tok_value = tok_tup = tok()
     assert tok_type is Token.BRACE_OPEN, tok_tup
@@ -293,7 +293,7 @@ def test_call_next(py_c_token: Type[Tokenizer]) -> None:
     assert tok() == (Token.BRACE_CLOSE, '}')
     assert next(it1) == (Token.STRING, "test")
     assert next(it1) == (Token.BRACE_OPEN, '{')
-    assert tok() == (Token.PLUS, '+')
+    assert tok() == (Token.EQUALS, '=')
     # Another iterator doesn't restart.
     assert next(iter(tok)) == (Token.BRACE_CLOSE, '}')
     assert tok() == (Token.EOF, '')
@@ -492,6 +492,7 @@ class SubStr(str):
     ('allow_star_comments', False),
     ('preserve_comments', False),
     ('colon_operator', False),
+    ('plus_operator', False),
 ])
 def test_obj_config(py_c_token: Type[Tokenizer], parm: str, default: bool) -> None:
     """Test getting and setting configuration attributes."""
@@ -577,38 +578,45 @@ def test_brackets(py_c_token: Type[Tokenizer]) -> None:
     ])
 
 
-def test_colon_op(py_c_token: Type[Tokenizer]) -> None:
-    """Test : can be detected as a string or operator depending on the option."""
+@pytest.mark.parametrize('op, tok, option', [
+    (':', Token.COLON, 'colon_operator'),
+    ('+', Token.PLUS, 'plus_operator'),
+], ids=['colon', 'plus'])
+def test_conditional_op(py_c_token: Type[Tokenizer], op: str, option: str, tok: Token) -> None:
+    """Test : and + can be detected as a string or operator depending on the option."""
+    disabled = {option: False}
+    enabled = {option: True}
+
     # Explicit string, unaffected.
-    check_tokens(py_c_token('"test:call"', colon_operator=False), [
-        (Token.STRING, 'test:call'),
+    check_tokens(py_c_token(f'"test{op}call"', **disabled), [
+        (Token.STRING, f'test{op}call'),
     ])
-    check_tokens(py_c_token('"test:call"', colon_operator=True), [
-        (Token.STRING, 'test:call'),
+    check_tokens(py_c_token(f'"test{op}call"', **enabled), [
+        (Token.STRING, f'test{op}call'),
     ])
 
     # Applies to bare strings, also note another char after.
-    check_tokens(py_c_token('test:call:{}', colon_operator=False), [
-        (Token.STRING, 'test:call:'),
+    check_tokens(py_c_token('test%call%{}'.replace('%', op), **disabled), [
+        (Token.STRING, f'test{op}call{op}'),
         Token.BRACE_OPEN, Token.BRACE_CLOSE,
     ])
-    check_tokens(py_c_token('test:call:{}', colon_operator=True), [
+    check_tokens(py_c_token('test%call%{}'.replace('%', op), **enabled), [
         (Token.STRING, 'test'),
-        Token.COLON,
+        tok,
         (Token.STRING, 'call'),
-        Token.COLON,
+        tok,
         Token.BRACE_OPEN, Token.BRACE_CLOSE,
     ])
 
-    # Test the string starting with a colon.
-    check_tokens(py_c_token('{:test:call}', colon_operator=False), [
+    # Test the string starting with the character.
+    check_tokens(py_c_token('{%test%call}'.replace('%', op), **disabled), [
         Token.BRACE_OPEN,
-        (Token.STRING, ':test:call'),
+        (Token.STRING, f'{op}test{op}call'),
         Token.BRACE_CLOSE,
     ])
-    check_tokens(py_c_token('{:test:call}', colon_operator=True), [
-        Token.BRACE_OPEN, Token.COLON,
-        (Token.STRING, 'test'), Token.COLON,
+    check_tokens(py_c_token('{%test%call}'.replace('%', op), **enabled), [
+        Token.BRACE_OPEN, tok,
+        (Token.STRING, 'test'), tok,
         (Token.STRING, 'call'),
         Token.BRACE_CLOSE,
     ])
@@ -851,11 +859,11 @@ def test_block_iter(py_c_token: Type[Tokenizer]) -> None:
     assert list(py_c_token('{}').block('')) == []
 
     # We can remove tokens halfway through on the original tokenizer.
-    tok = py_c_token(' { \n\n"legal" { + } block } ')
+    tok = py_c_token(' { \n\n"legal" { = } block } ')
     bl = tok.block("test")
     assert next(bl) == 'legal'
     assert tok() == (Token.BRACE_OPEN, '{')
-    assert tok() == (Token.PLUS, '+')
+    assert tok() == (Token.EQUALS, '=')
     assert tok() == (Token.BRACE_CLOSE, '}')
     assert next(bl) == 'block'
     with pytest.raises(StopIteration):
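
A quick sketch of how the new flag behaves, following the expectations pinned down by test_conditional_op and test_obj_config above. The 'game+mod' input is illustrative only, echoing the +-joined keys found in gameinfo.txt SearchPaths blocks:

    from srctools.tokenizer import Token, Tokenizer

    # By default plus_operator is False, so '+' remains part of bare
    # strings - gameinfo.txt-style keys such as game+mod stay intact.
    tok = Tokenizer('game+mod')
    assert tok() == (Token.STRING, 'game+mod')

    # Opting in (as fgd.parse_file now does) makes '+' a PLUS token,
    # splitting the same input into three tokens.
    tok = Tokenizer('game+mod', plus_operator=True)
    assert tok() == (Token.STRING, 'game')
    assert tok() == (Token.PLUS, '+')
    assert tok() == (Token.STRING, 'mod')

    # Like colon_operator, the flag is also a settable property,
    # so it can be toggled mid-parse.
    tok.plus_operator = False

Quoted strings are unaffected either way; only bare strings are split, which is what lets FGD parsing opt in while Keyvalues parsing keeps the old behaviour.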