|
3 | 3 | from .tokens import Token, LPar, RPar, Repetition, Pipe, CharacterSet, BoundaryAssertion, BoundaryAssertionSemantic |
4 | 4 | from ..automata.rangeset import RangeSet, WORD_RANGESET, NONWORD_RANGESET, DIGIT_RANGESET, NONDIGIT_RANGESET, \ |
5 | 5 | WHITESPACE_RANGESET, NONWHITESPACE_RANGESET |
6 | | -from ..errors import TokenizerError, UnsupportedSyntaxError |
| 6 | +from ..errors import TokenizerError, UnsupportedSyntaxError, RegexAutomataError |
7 | 7 | from ..regex.flags import PatternFlag |
8 | 8 |
|
9 | 9 |
|
10 | 10 | class Tokenizer: |
11 | 11 | class Reader: |
12 | 12 | def __init__(self, tokenizer: "Tokenizer") -> None: |
13 | 13 | self.tokenizer = tokenizer |
| 14 | + self.tokenizer_initial_pos = tokenizer.pos |
14 | 15 | self.start = tokenizer.pos + 1 |
15 | 16 | self.end = self.start |
16 | 17 |
|
@@ -42,6 +43,14 @@ def span(self) -> tuple[int, int]: |
42 | 43 | def text(self) -> str: |
43 | 44 | return self.tokenizer.text[self.start:self.end] |
44 | 45 |
|
| 46 | + def reset(self) -> None: |
| 47 | + self.tokenizer.pos = self.tokenizer_initial_pos |
| 48 | + self.start = self.tokenizer.pos + 1 |
| 49 | + self.end = self.start |
| 50 | + |
| 51 | + class FailedToReadBraceRepetition(Exception): |
| 52 | + pass |
| 53 | + |
45 | 54 | def __init__(self, text: str, flags: PatternFlag = PatternFlag.NOFLAG) -> None: |
46 | 55 | self.text = text |
47 | 56 | self.flags = flags |
@@ -88,7 +97,11 @@ def get_tokens(self) -> Iterator[Token]: |
88 | 97 | case ")": |
89 | 98 | yield self.read_RPar(reader) |
90 | 99 | case "*" | "?" | "+" | "{": |
91 | | - yield self.read_Repetition(reader) |
| 100 | + try: |
| 101 | + yield self.read_Repetition(reader) |
| 102 | + except self.FailedToReadBraceRepetition: |
| 103 | + reader.reset() |
| 104 | + yield self.read_CharacterSet(reader) |
92 | 105 | case "|": |
93 | 106 | yield self.read_Pipe(reader) |
94 | 107 | case ".": |
@@ -132,45 +145,47 @@ def read_Repetition(self, reader: Reader) -> Repetition: |
132 | 145 | self.error(f"{c}{c2} quantifier is not supported", unsupported=True) |
133 | 146 | return Repetition(reader.span, reader.text, 1, None) |
134 | 147 | case "{": |
135 | | - read_lower_limit = False |
136 | | - c = self.peek() |
137 | | - if c is None: |
138 | | - self.error("bad repetition definition") |
139 | | - elif c == ",": |
140 | | - rmin = 0 |
141 | | - elif c.isdigit(): |
142 | | - rmin = reader.read_number() |
143 | | - read_lower_limit = True |
144 | | - else: |
145 | | - self.error("bad repetition definition") |
146 | | - |
147 | | - c = self.peek() |
148 | | - if c is None: |
149 | | - self.error("bad repetition definition") |
150 | | - elif c == ",": |
151 | | - reader.read(",") |
152 | | - |
| 148 | + try: |
| 149 | + read_lower_limit = False |
153 | 150 | c = self.peek() |
154 | | - if c == "}": |
155 | | - rmax = None |
156 | | - elif c is not None and c.isdigit(): |
157 | | - rmax = reader.read_number() |
| 151 | + if c is None: |
| 152 | + self.error("bad repetition definition") |
| 153 | + elif c == ",": |
| 154 | + rmin = 0 |
| 155 | + elif c.isdigit(): |
| 156 | + rmin = reader.read_number() |
| 157 | + read_lower_limit = True |
158 | 158 | else: |
159 | 159 | self.error("bad repetition definition") |
160 | | - elif c == "}": |
161 | | - if not read_lower_limit: |
162 | | - self.error("bad repetition definition (braced definition missing both limits)") |
163 | | - rmax = rmin |
164 | | - else: |
165 | | - self.error("bad repetition definition") |
166 | 160 |
|
167 | | - reader.read("}") |
| 161 | + c = self.peek() |
| 162 | + if c is None: |
| 163 | + self.error("bad repetition definition") |
| 164 | + elif c == ",": |
| 165 | + reader.read(",") |
| 166 | + |
| 167 | + c = self.peek() |
| 168 | + if c == "}": |
| 169 | + rmax = None |
| 170 | + elif c is not None and c.isdigit(): |
| 171 | + rmax = reader.read_number() |
| 172 | + else: |
| 173 | + self.error("bad repetition definition") |
| 174 | + elif c == "}": |
| 175 | + if not read_lower_limit: |
| 176 | + self.error("bad repetition definition (braced definition missing both limits)") |
| 177 | + rmax = rmin |
| 178 | + else: |
| 179 | + self.error("bad repetition definition") |
168 | 180 |
|
169 | | - if (c2 := self.peek()) in ("?", "+"): |
170 | | - self.error(f"{{...}}{c2} quantifier is not supported", unsupported=True) |
| 181 | + reader.read("}") |
171 | 182 |
|
172 | | - return Repetition(reader.span, reader.text, rmin, rmax) |
| 183 | + if (c2 := self.peek()) in ("?", "+"): |
| 184 | + self.error(f"{{...}}{c2} quantifier is not supported", unsupported=True) |
173 | 185 |
|
| 186 | + return Repetition(reader.span, reader.text, rmin, rmax) |
| 187 | + except RegexAutomataError as e: |
| 188 | + raise self.FailedToReadBraceRepetition() from e |
174 | 189 | case _: |
175 | 190 | self.error("bad repetition definition") |
176 | 191 |
|
|
0 commit comments