Skip to content

Commit e4a7d1e

Browse files
committed
fix: Allow braces in non-repetition context
1 parent 4e81966 commit e4a7d1e

File tree

3 files changed: 57 additions and 36 deletions.

src/regex_automata/parser/tokenizer.py

Lines changed: 49 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -3,14 +3,15 @@
33
from .tokens import Token, LPar, RPar, Repetition, Pipe, CharacterSet, BoundaryAssertion, BoundaryAssertionSemantic
44
from ..automata.rangeset import RangeSet, WORD_RANGESET, NONWORD_RANGESET, DIGIT_RANGESET, NONDIGIT_RANGESET, \
55
WHITESPACE_RANGESET, NONWHITESPACE_RANGESET
6-
from ..errors import TokenizerError, UnsupportedSyntaxError
6+
from ..errors import TokenizerError, UnsupportedSyntaxError, RegexAutomataError
77
from ..regex.flags import PatternFlag
88

99

1010
class Tokenizer:
1111
class Reader:
1212
def __init__(self, tokenizer: "Tokenizer") -> None:
1313
self.tokenizer = tokenizer
14+
self.tokenizer_initial_pos = tokenizer.pos
1415
self.start = tokenizer.pos + 1
1516
self.end = self.start
1617

@@ -42,6 +43,14 @@ def span(self) -> tuple[int, int]:
4243
def text(self) -> str:
4344
return self.tokenizer.text[self.start:self.end]
4445

46+
def reset(self) -> None:
47+
self.tokenizer.pos = self.tokenizer_initial_pos
48+
self.start = self.tokenizer.pos + 1
49+
self.end = self.start
50+
51+
class FailedToReadBraceRepetition(Exception):
52+
pass
53+
4554
def __init__(self, text: str, flags: PatternFlag = PatternFlag.NOFLAG) -> None:
4655
self.text = text
4756
self.flags = flags
@@ -88,7 +97,11 @@ def get_tokens(self) -> Iterator[Token]:
8897
case ")":
8998
yield self.read_RPar(reader)
9099
case "*" | "?" | "+" | "{":
91-
yield self.read_Repetition(reader)
100+
try:
101+
yield self.read_Repetition(reader)
102+
except self.FailedToReadBraceRepetition:
103+
reader.reset()
104+
yield self.read_CharacterSet(reader)
92105
case "|":
93106
yield self.read_Pipe(reader)
94107
case ".":
@@ -132,45 +145,47 @@ def read_Repetition(self, reader: Reader) -> Repetition:
132145
self.error(f"{c}{c2} quantifier is not supported", unsupported=True)
133146
return Repetition(reader.span, reader.text, 1, None)
134147
case "{":
135-
read_lower_limit = False
136-
c = self.peek()
137-
if c is None:
138-
self.error("bad repetition definition")
139-
elif c == ",":
140-
rmin = 0
141-
elif c.isdigit():
142-
rmin = reader.read_number()
143-
read_lower_limit = True
144-
else:
145-
self.error("bad repetition definition")
146-
147-
c = self.peek()
148-
if c is None:
149-
self.error("bad repetition definition")
150-
elif c == ",":
151-
reader.read(",")
152-
148+
try:
149+
read_lower_limit = False
153150
c = self.peek()
154-
if c == "}":
155-
rmax = None
156-
elif c is not None and c.isdigit():
157-
rmax = reader.read_number()
151+
if c is None:
152+
self.error("bad repetition definition")
153+
elif c == ",":
154+
rmin = 0
155+
elif c.isdigit():
156+
rmin = reader.read_number()
157+
read_lower_limit = True
158158
else:
159159
self.error("bad repetition definition")
160-
elif c == "}":
161-
if not read_lower_limit:
162-
self.error("bad repetition definition (braced definition missing both limits)")
163-
rmax = rmin
164-
else:
165-
self.error("bad repetition definition")
166160

167-
reader.read("}")
161+
c = self.peek()
162+
if c is None:
163+
self.error("bad repetition definition")
164+
elif c == ",":
165+
reader.read(",")
166+
167+
c = self.peek()
168+
if c == "}":
169+
rmax = None
170+
elif c is not None and c.isdigit():
171+
rmax = reader.read_number()
172+
else:
173+
self.error("bad repetition definition")
174+
elif c == "}":
175+
if not read_lower_limit:
176+
self.error("bad repetition definition (braced definition missing both limits)")
177+
rmax = rmin
178+
else:
179+
self.error("bad repetition definition")
168180

169-
if (c2 := self.peek()) in ("?", "+"):
170-
self.error(f"{{...}}{c2} quantifier is not supported", unsupported=True)
181+
reader.read("}")
171182

172-
return Repetition(reader.span, reader.text, rmin, rmax)
183+
if (c2 := self.peek()) in ("?", "+"):
184+
self.error(f"{{...}}{c2} quantifier is not supported", unsupported=True)
173185

186+
return Repetition(reader.span, reader.text, rmin, rmax)
187+
except RegexAutomataError as e:
188+
raise self.FailedToReadBraceRepetition() from e
174189
case _:
175190
self.error("bad repetition definition")
176191

tests/test_parser.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -43,8 +43,12 @@ def test_parse_tree_union_parens():
4343
)
4444

4545

46-
@pytest.mark.parametrize("pattern", ["\\", "[a", "[a-bc-", "[]",
47-
"{", "{123", "{123,", "{,123", "{,123,}", "{123,456,}"])
46+
@pytest.mark.parametrize("pattern", ["\\", "[a", "[a-bc-", "[]"])
4847
def test_tokenizer_errors_in_pattern_malformed(pattern):
4948
with pytest.raises(TokenizerError):
5049
list(Tokenizer(pattern).get_tokens())
50+
51+
52+
@pytest.mark.parametrize("pattern", ["{", "{123", "{123,", "{,123", "{,123,}", "{123,456,}"])
53+
def test_tokenizer_errors_in_pattern_nonmalformed(pattern):
54+
list(Tokenizer(pattern).get_tokens())

tests/test_regex.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,8 @@
4646
# ("a||c|", "", True),
4747
# ("a||c|", "a", True),
4848
# ("a||c|", "c", True),
49+
("{[^}]*}", "{foo bar}", True),
50+
("x{3}{[^}]*}{3}", "xxx{foo bar}}}", True),
4951
])
5052
def test_fullmatch_regex(pattern: str, s: str, result: bool):
5153
assert (regex_automata.fullmatch(pattern, s) is not None) is result

0 commit comments

Comments
 (0)