Skip to content

Commit 642ed1f

Browse files
committed
Avoid escaping / and ?, these are unambiguous.
1 parent f2b7f8c commit 642ed1f

File tree

5 files changed

+15
-8
lines changed

5 files changed

+15
-8
lines changed

docs/source/changelog.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ Changelog
99
Version (dev)
1010
-------------
1111
* #35: Only use character escapes in FGD custom syntax mode. The original parser only allows `\\n`.
12+
* Avoid escaping `/` and `?`; these are unambiguous.
1213

1314
-------------
1415
Version 2.4.1

src/srctools/_tokenizer.pyx

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1071,7 +1071,8 @@ def escape_text(str text not None: str) -> str:
10711071
r"""Escape special characters and backslashes, so tokenising reproduces them.
10721072

10731073
This matches utilbuffer.cpp in the SDK.
1074-
The following characters are escaped: \n, \t, \v, \b, \r, \f, \a, \, /, ?, ', ".
1074+
The following characters are escaped: \n, \t, \v, \b, \r, \f, \a, \, ', ".
1075+
/ and ? are accepted as escapes, but not produced since they're unambiguous.
10751076
"""
10761077
# UTF8 = ASCII for the chars we care about, so we can just loop over the
10771078
# UTF8 data.
@@ -1114,8 +1115,6 @@ def escape_text(str text not None: str) -> str:
11141115
j = _write_escape(out_buff, j, b'f')
11151116
elif letter == b'\a':
11161117
j = _write_escape(out_buff, j, b'a')
1117-
elif letter == b'?':
1118-
j = _write_escape(out_buff, j, b'?')
11191118
elif letter == b'\\':
11201119
j = _write_escape(out_buff, j, b'\\')
11211120
elif letter == b'"':

src/srctools/tokenizer.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
Character escapes match `utilbuffer.cpp <https://github.com/ValveSoftware/source-sdk-2013/blob/0d8dceea4310fde5706b3ce1c70609d72a38efdf/sp/src/tier1/utlbuffer.cpp#L57-L69>`_ in the SDK.
1818
Specifically, the following characters are escaped:
1919
`\\\\n`, `\\\\t`, `\\\\v`, `\\\\b`, `\\\\r`, `\\\\f`, `\\\\a`, `\\`, `'` and `"`.
20+
`/` and `?` are accepted as escapes, but not produced since they're unambiguous.
2021
"""
2122
import re
2223
from typing import (
@@ -177,7 +178,10 @@ def has_value(self) -> bool:
177178
'?': '?',
178179
}
179180
ESCAPES_INV = {char: f'\\{sym}' for sym, char in ESCAPES.items()}
180-
ESCAPE_RE = re.compile('|'.join(map(re.escape, ESCAPES_INV)))
181+
ESCAPE_RE = re.compile('|'.join(
182+
re.escape(c) for c in ESCAPES_INV
183+
if c not in '?/'
184+
))
181185

182186
#: Characters not allowed for bare strings. These must be quoted.
183187
BARE_DISALLOWED: Final = frozenset('"\'{};,=[]()\r\n\t ')

tests/test_fgd/test_export_regressions_vanilla_.fgd

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
= npc_test: "Entity description, extending beyond 1000 characters: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, " +
1818
"211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, " +
1919
"411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499. Done!\n escapes: tab= , newline=\n, ''quoted'', bell="
20-
[ test_kv(color255) : "A test keyvalue" : "255 255 128" : "Help text for a keyvalue"
20+
[
21+
test_kv(color255) : "A test keyvalue" : "255 255 128" : "Help text for a keyvalue"
2122
spawnflags(flags) =
2223
[
2324
1: "[1] A" : 0

tests/test_tokenizer.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
IS_CPYTHON = platform.python_implementation() == 'CPython'
2020

2121
# See https://github.com/ValveSoftware/source-sdk-2013/blob/0d8dceea4310fde5706b3ce1c70609d72a38efdf/sp/src/tier1/utlbuffer.cpp#L57-L69
22-
ESCAPE_CHARS = "\n \t \v \b \r \f \a \\ ? \' \""
23-
ESCAPE_ENCODED = r"\n \t \v \b \r \f \a \\ \? \' " + r'\"'
22+
ESCAPE_CHARS = "\n \t \v \b \r \f \a \\ \' \""
23+
ESCAPE_ENCODED = r"\n \t \v \b \r \f \a \\ \' " + r'\"'
2424

2525
# The correct result of parsing prop_parse_test.
2626
# Either the token, or token + value (which must be correct).
@@ -535,6 +535,7 @@ def test_obj_config(py_c_token: Type[Tokenizer], parm: str, default: bool) -> No
535535
("\\thello_world", r"\\thello_world"),
536536
("\\ttest\nvalue\t\\r\t\n", r"\\ttest\nvalue\t\\r\t\n"),
537537
(ESCAPE_CHARS, ESCAPE_ENCODED),
538+
('Unchanged ?/', 'Unchanged ?/'),
538539
# BMP characters, and some multiplane chars.
539540
('test: ╒══╕', r'test: ╒══╕'),
540541
("♜♞🤐♝♛🥌 chess: ♚♝♞♜", "♜♞🤐♝♛🥌 chess: ♚♝♞♜"),
@@ -685,12 +686,13 @@ def test_allow_escapes(py_c_token: Type[Tokenizer]) -> None:
685686
Token.BRACE_CLOSE,
686687
])
687688
check_tokens(py_c_token(
688-
f'{{ "string\\n" "all escapes: {ESCAPE_ENCODED}" }}',
689+
f'{{ "string\\n" "all escapes: {ESCAPE_ENCODED}" "also accepted /?" }}',
689690
allow_escapes=True,
690691
), [
691692
Token.BRACE_OPEN,
692693
(Token.STRING, "string\n"),
693694
(Token.STRING, f'all escapes: {ESCAPE_CHARS}'),
695+
(Token.STRING, 'also accepted /?'),
694696
Token.BRACE_CLOSE,
695697
])
696698

0 commit comments

Comments
 (0)