Skip to content

Commit 4c4aa46

Browse files
committed
feat: Store actual info in Match objects; implement match, search methods (rudimentary)
1 parent 07471c1 commit 4c4aa46

File tree

7 files changed

+122
-44
lines changed

7 files changed

+122
-44
lines changed

README.md

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ import regex_automata
1313

1414
pattern = regex_automata.compile(r"(foo)*bar|baz") # regex_automata.Pattern
1515

16-
pattern.fullmatch("foofoobar") # regex_automata.Match(...)
17-
pattern.fullmatch("foo") # None
16+
pattern.fullmatch("foofoobar") # regex_automata.Match(span=(0, 9), match='foofoobar')
17+
pattern.fullmatch("foo") # None
1818

1919
pattern.ast # regex_automata.parser.ast.AstNode
2020
pattern.nfa # regex_automata.automata.nfa.NFA
@@ -35,14 +35,28 @@ Finite automaton accepting `(foo)*bar|baz`:
3535
## Features compared to standard `re` module
3636

3737
- Library
38-
- `fullmatch()` method (but the `Match` object is currently just empty)
38+
- `match()`, `fullmatch()` and `search()` methods (search is currently implemented naively via match)
39+
- `Match` object containing span and matched text (but no groups)
3940
- flags `DOTALL` and `IGNORECASE`
4041

4142
- Syntax
4243
- character sets: `.`, `[...]` (special sequences such as `\w` are not supported)
4344
- repetition: `*`, `?`, `+`, `{n,k}`
4445
- basic groups: `(...)` that behave like `(?:...)`, i.e. non-capturing
4546

47+
## Implementation overview
48+
49+
- Input pattern is tokenized via `regex_automata.parser.tokenizer.Tokenizer`
50+
- Characters and sets are represented with `regex_automata.automata.rangeset.RangeSet`
51+
- List of tokens is processed by recursive descent parser `regex_automata.parser.parser.Parser`
52+
- Parser produces "raw" abstract syntax tree composed of `regex_automata.parser.ast.AstNode` nodes
53+
- AST is processed with `regex_automata.parser.ast_processor.ASTProcessor` to produce the final tree
54+
- This is used to replace fancy repetition with primitives (union, concatenation, iteration)
55+
- Epsilon-free NFA is recursively constructed from the AST using `regex_automata.regex.nfa_builder.NFABuilder`
56+
- The processed pattern is stored in `regex_automata.regex.pattern.Pattern`, which is the high-level interface
57+
- When processing input text, the text and NFA are passed to `regex_automata.regex.nfa_evaluator.NFAEvaluator`
58+
- The evaluator produces `regex_automata.regex.match.Match` objects
59+
4660
## Grammar
4761

4862
The recursive descent parser uses the following LL(1) grammar:

src/regex_automata/automata/nfa_evaluator.py

Lines changed: 0 additions & 27 deletions
This file was deleted.

src/regex_automata/regex/match.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,15 @@
1+
from dataclasses import dataclass
2+
from typing import Self
3+
4+
5+
@dataclass
16
class Match:
2-
pass # TODO
7+
span: tuple[int, int]
8+
match: str
9+
10+
@classmethod
11+
def from_span_and_text(cls, start: int, end: int, text: str) -> Self:
12+
return cls((start, end), text[start:end])
13+
14+
def group(self) -> str:
15+
return self.match
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from regex_automata.automata.nfa import NFA
2+
from regex_automata.regex.flags import PatternFlag
3+
from regex_automata.regex.match import Match
4+
5+
6+
class NFAEvaluator:
    """Run an NFA over a slice of text and report the match anchored at its start.

    The set of currently active NFA states is kept in ``self.states`` and is
    advanced one character at a time via :meth:`step`.
    """

    def __init__(self, nfa: NFA, flags: PatternFlag = PatternFlag.NOFLAG) -> None:
        """Store the automaton and flags and precompute the start/accepting state sets."""
        self.nfa = nfa
        self.flags = flags
        # Epsilon closure of the start state: every match() call begins here.
        self.initial_states: set[int] = set(self.nfa.epsilon_closure({self.nfa.initial_state}))
        self.final_states = set(self.nfa.final_states)
        # Working set of active states; match() resets it on every call.
        self.states: set[int] = set(self.initial_states)

    def match(self, text: str, start: int = 0, end: int | None = None) -> Match | None:
        """Return the longest match of the automaton that begins exactly at ``start``.

        Args:
            text: The text to scan.
            start: Offset at which the match must begin. Clamped to ``[0, len(text)]``.
            end: Exclusive upper bound of the scanned region; ``None`` means
                ``len(text)``. Clamped to ``[0, len(text)]``.

        Returns:
            A ``Match`` whose span is ``(start, k)`` for the largest ``k <= end``
            such that ``text[start:k]`` is accepted, or ``None`` if no prefix of
            the region is accepted (or if ``start > end``).
        """
        ignore_case = bool(self.flags & PatternFlag.IGNORECASE)

        # Clamp the bounds so out-of-range offsets cannot raise IndexError.
        length = len(text)
        start = min(max(start, 0), length)
        end_ = length if end is None else min(max(end, 0), length)
        if start > end_:
            return None

        # Always start from the initial configuration so the evaluator can be
        # reused for several match() calls.
        self.states = set(self.initial_states)

        # Offset just past the longest accepted prefix seen so far.
        last_accept: int | None = start if self.states & self.final_states else None

        for i in range(start, end_):
            c = text[i]
            if ignore_case:
                # Fold one character at a time: lower-casing the whole text can
                # change its length and would shift every reported offset.
                # NOTE(review): presumably the NFA is built from a lower-cased
                # pattern when IGNORECASE is set -- confirm against the builder.
                folded = c.lower()
                if len(folded) == 1:
                    c = folded

            self.states = self.step(ord(c), self.states)
            if not self.states:
                # No active state left: no longer prefix can be accepted.
                break
            if self.states & self.final_states:
                last_accept = i + 1

        if last_accept is None:
            return None
        # Slice the ORIGINAL text so the reported match keeps its original casing.
        return Match.from_span_and_text(start, last_accept, text)

    def step(self, c: int, states: set[int]) -> set[int]:
        """Advance ``states`` over the code point ``c``.

        For every state in ``states`` this collects the targets of all
        transitions whose label set contains ``c`` and returns the epsilon
        closure of the collected targets.
        """
        new_states: set[int] = set()
        for u in states:
            # States without outgoing transitions are simply absent from the table.
            u_transitions = self.nfa.transitions.get(u, {})
            for lrs, vs in u_transitions.items():
                if c in lrs.set:
                    new_states.update(vs)

        return self.nfa.epsilon_closure(new_states)

src/regex_automata/regex/pattern.py

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from .flags import PatternFlag
22
from .match import Match
3-
from regex_automata.automata.nfa_evaluator import NFAEvaluator
3+
from regex_automata.regex.nfa_evaluator import NFAEvaluator
44
from ..errors import ParserError, PatternError, TokenizerError
55
from ..parser.ast_processor import ASTProcessor
66
from ..parser.ast_visualizer import ASTVisualizer
@@ -52,15 +52,22 @@ def render_ast(self, output_path: str = "ast.png", raw: bool = False) -> None:
5252
ast = self.ast if not raw else self.raw_ast
5353
ASTVisualizer(ast).render(output_path)
5454

55-
def fullmatch(self, s: str) -> Match | None:
56-
evaluator = NFAEvaluator(self.nfa, self.flags)
57-
if evaluator.accepts(s):
58-
return Match()
59-
else:
60-
return None
55+
def fullmatch(self, text: str, start: int = 0, end: int | None = None) -> Match | None:
    """Match the pattern against the whole region ``text[start:end]``.

    Delegates to ``self.match`` and keeps its result only when the match
    ends exactly at the end of the region (``end``, or ``len(text)`` when
    ``end`` is ``None``); otherwise returns ``None``.
    """
    region_end = len(text) if end is None else end
    candidate = self.match(text, start, end)
    if candidate is None:
        return None
    # A match that stops short of the region end is not a full match.
    return candidate if candidate.span[-1] == region_end else None
6161

62-
def match(self, s: str) -> Match | None:
63-
raise NotImplementedError # TODO
62+
def match(self, text: str, start: int = 0, end: int | None = None) -> Match | None:
    """Match the pattern at position ``start`` of ``text``, scanning no further than ``end``.

    A fresh ``NFAEvaluator`` is created from this pattern's NFA and flags
    for every call, and its result is returned as-is.
    """
    return NFAEvaluator(self.nfa, self.flags).match(text, start, end)
6465

65-
def search(self, s: str) -> Match | None:
66-
raise NotImplementedError # TODO
66+
def search(self, text: str, start: int = 0, end: int | None = None) -> Match | None:
    """Return the first match found at any position in ``text[start:end]``.

    Every candidate start offset from ``start`` up to and including the
    region end is tried in turn with ``self.match``; the first non-``None``
    result is returned. The region end itself is a valid start offset so
    that a pattern matching the empty string is found in empty text or
    when ``start`` equals the region end.

    Args:
        text: The text to scan.
        start: First offset at which a match may begin.
        end: Exclusive upper bound of the scanned region; ``None`` means
            ``len(text)``. Values beyond ``len(text)`` are clamped.

    Returns:
        The first ``Match`` found, or ``None`` if no offset yields one.
    """
    # TODO implement this properly via automaton
    end_ = len(text) if end is None else min(end, len(text))
    for i in range(start, end_ + 1):
        m = self.match(text, i, end_)
        if m is not None:
            return m
    return None

tests/test_nfa.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
import pytest
44

55
from regex_automata.automata.nfa import NFA
6-
from regex_automata.automata.nfa_evaluator import NFAEvaluator
6+
from regex_automata.regex.nfa_evaluator import NFAEvaluator
77

88
DATA_DIR = PurePath(__file__).parent / "data"
99

1010
def _accepts(nfa: NFA, s: str) -> bool:
    """Report whether ``nfa`` accepts ``s``, i.e. a match from offset 0 covers all of ``s``."""
    result = NFAEvaluator(nfa).match(s)
    if result is None:
        return False
    # Acceptance requires the match to span the entire input, not just a prefix.
    return result.match == s
1314

1415

1516
def _test_lol_strings(nfa: NFA):

tests/test_regex.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,3 +82,25 @@ def test_digits():
8282

8383
assert not p.fullmatch("60")
8484
assert not p.fullmatch("01")
85+
86+
87+
def test_match():
    """match() is anchored at `start` and honours the optional `end` bound."""
    triple_a = regex_automata.compile(r"a{3}")

    hit = triple_a.match("aaa")
    assert hit is not None and hit.span == (0, 3)

    # Anchored at offset 0, the leading "b" prevents any match.
    assert triple_a.match("baaa") is None

    shifted = triple_a.match("baaa", start=1)
    assert shifted is not None and shifted.span == (1, 4)

    many_a = regex_automata.compile(r"a+")
    bounded = many_a.match("aaaaaaaaaaaaaaaaaa", start=5, end=7)
    assert bounded is not None and bounded.span == (5, 7)
99+
100+
101+
def test_search():
    """search() returns the first match that begins at or after `start`."""
    email = regex_automata.compile(r"[a-z0-9]+@[a-z0-9]+\.[a-z0-9]+")
    haystack = "text abc@def.com xyz@123.com"

    first = email.search(haystack)
    assert first is not None and first.match == "abc@def.com"

    # Starting inside the first address skips it and finds the second one.
    later = email.search(haystack, start=10)
    assert later is not None and later.match == "xyz@123.com"

0 commit comments

Comments
 (0)