feat: Store actual info in Match objects; implement match, search methods (rudimentary)

tkarabela · tkarabela · commit 4c4aa4689566 · 2025-10-11T21:52:07.000+02:00
diff --git a/README.md b/README.md
@@ -13,8 +13,8 @@ import regex_automata
 
 pattern = regex_automata.compile(r"(foo)*bar|baz")  # regex_automata.Pattern
 
-pattern.fullmatch("foofoobar")  # regex_automata.Match(...)
-pattern.fullmatch("foo")  # None
+pattern.fullmatch("foofoobar")  # regex_automata.Match(span=(0, 9), match='foofoobar')
+pattern.fullmatch("foo")        # None
 
 pattern.ast  # regex_automata.parser.ast.AstNode
 pattern.nfa  # regex_automata.automata.nfa.NFA
@@ -35,14 +35,28 @@ Finite automaton accepting `(foo)*bar|baz`:
 ## Features compared to standard `re` module
 
 - Library
-  - `fullmatch()` method (but the `Match` object is currently just empty)
+  - `match()`, `fullmatch()` and `search()` methods (`search()` is currently implemented naively by calling `match()` at each successive start position)
+  - `Match` object containing span and matched text (but no groups)
   - flags `DOTALL` and `IGNORECASE`
 
 - Syntax
   - character sets: `.`, `[...]` (special sequences such as `\w` are not supported)
   - repetition: `*`, `?`, `+`, `{n,k}`
   - basic groups: `(...)` that behave like `(?:...)`, i.e. non-capturing
 
+## Implementation overview
+
+- The input pattern is tokenized by `regex_automata.parser.tokenizer.Tokenizer`
+  - Characters and character sets are represented with `regex_automata.automata.rangeset.RangeSet`
+- The list of tokens is processed by the recursive descent parser `regex_automata.parser.parser.Parser`
+- The parser produces a "raw" abstract syntax tree (AST) composed of `regex_automata.parser.ast.AstNode` nodes
+- The raw AST is processed with `regex_automata.parser.ast_processor.ASTProcessor` to produce the final tree
+  - This step replaces fancy repetition with primitives (union, concatenation, iteration)
+- An epsilon-free NFA is recursively constructed from the AST using `regex_automata.regex.nfa_builder.NFABuilder`
+- The processed pattern is stored in `regex_automata.regex.pattern.Pattern`, which is the high-level interface
+- When processing input text, the text and the NFA are passed to `regex_automata.regex.nfa_evaluator.NFAEvaluator`
+- The evaluator produces `regex_automata.regex.match.Match` objects
+
 ## Grammar
 
 The recursive descent parser uses the following LL(1) grammar:
diff --git a/src/regex_automata/automata/nfa_evaluator.py b/src/regex_automata/automata/nfa_evaluator.py
diff --git a/src/regex_automata/regex/match.py b/src/regex_automata/regex/match.py
@@ -1,2 +1,15 @@
+from dataclasses import dataclass
+from typing import Self
+
+
+@dataclass  # Match records where a pattern matched and the text it covered
 class Match:
-    pass  # TODO
+    span: tuple[int, int]  # (start, end) offsets; from_span_and_text() pairs them with text[start:end]
+    match: str  # the matched substring
+
+    @classmethod
+    def from_span_and_text(cls, start: int, end: int, text: str) -> Self:  # alternate constructor: slices `text` itself
+        return cls((start, end), text[start:end])
+
+    def group(self) -> str:  # returns the whole matched text; takes no group argument
+        return self.match
diff --git a/src/regex_automata/regex/nfa_evaluator.py b/src/regex_automata/regex/nfa_evaluator.py
@@ -0,0 +1,48 @@
+from regex_automata.automata.nfa import NFA
+from regex_automata.regex.flags import PatternFlag
+from regex_automata.regex.match import Match
+
+
+class NFAEvaluator:  # simulates an NFA over input text and reports matches as Match objects
+    def __init__(self, nfa: NFA, flags: PatternFlag = PatternFlag.NOFLAG) -> None:
+        self.nfa = nfa
+        self.states: set[int] = self.nfa.epsilon_closure({nfa.initial_state})  # current state set; advanced by match()
+        self.flags = flags
+        self.initial_states = self.nfa.epsilon_closure({self.nfa.initial_state})  # same closure as self.states; not read by match() or step()
+        self.final_states = set(self.nfa.final_states)
+
+    def match(self, text: str, start: int = 0, end: int | None = None) -> Match | None:
+        if self.flags & PatternFlag.IGNORECASE:
+            text = text.lower()  # NOTE(review): the Match below is sliced from this lowercased copy, not the caller's text
+
+        end_ = end if end is not None else len(text)  # NOTE(review): an `end` past len(text) can make text[i] below raise IndexError
+
+        entered_final = bool(self.states & self.final_states)  # True if the starting state set already contains a final state
+        left_final = False
+
+        for i in range(start, end_):
+            c = text[i]
+            new_states = self.step(ord(c), self.states)  # NOTE(review): continues from self.states, which is never reset, so a second match() call starts mid-run
+            new_in_final = bool(new_states & self.final_states)
+            entered_final = entered_final or new_in_final  # sticky: stays True once any final state was reached
+            left_final = entered_final and not new_in_final  # a final state was reached earlier but this character leaves it
+
+            if left_final:
+                return Match.from_span_and_text(start, i, text)  # NOTE(review): stops at the first exit from the final states; a later re-entry (longer match) is never seen
+
+            self.states = new_states
+
+        if entered_final and not left_final:  # left_final is always False here: a True value returns inside the loop
+            return Match.from_span_and_text(start, end_, text)
+        else:
+            return None
+
+    def step(self, c: int, states: set[int]) -> set[int]:  # advance `states` by one symbol c (match() passes ord(char))
+        new_states = set()
+        for u in states:
+            u_transitions = self.nfa.transitions.get(u, {})  # states without an entry have no outgoing transitions
+            for lrs, vs in u_transitions.items():
+                if c in lrs.set:  # this transition's label accepts c
+                    new_states.update(vs)
+
+        return self.nfa.epsilon_closure(new_states)
diff --git a/src/regex_automata/regex/pattern.py b/src/regex_automata/regex/pattern.py
@@ -1,6 +1,6 @@
 from .flags import PatternFlag
 from .match import Match
-from regex_automata.automata.nfa_evaluator import NFAEvaluator
+from regex_automata.regex.nfa_evaluator import NFAEvaluator
 from ..errors import ParserError, PatternError, TokenizerError
 from ..parser.ast_processor import ASTProcessor
 from ..parser.ast_visualizer import ASTVisualizer
@@ -52,15 +52,22 @@ def render_ast(self, output_path: str = "ast.png", raw: bool = False) -> None:
         ast = self.ast if not raw else self.raw_ast
         ASTVisualizer(ast).render(output_path)
 
-    def fullmatch(self, s: str) -> Match | None:
-        evaluator = NFAEvaluator(self.nfa, self.flags)
-        if evaluator.accepts(s):
-            return Match()
-        else:
-            return None
+    def fullmatch(self, text: str, start: int = 0, end: int | None = None) -> Match | None:
+        end_ = end if end is not None else len(text)  # position the match is required to reach
+        m = self.match(text, start, end)  # NOTE(review): correct only if match() returns the longest match from `start` - confirm
+        if m is not None and m.span[-1] != end_:  # discard matches that stop short of the end
+            m = None
+        return m
 
-    def match(self, s: str) -> Match | None:
-        raise NotImplementedError  # TODO
+    def match(self, text: str, start: int = 0, end: int | None = None) -> Match | None:
+        evaluator = NFAEvaluator(self.nfa, self.flags)  # fresh evaluator per call, so no state carries over between calls
+        return evaluator.match(text, start, end)
 
-    def search(self, s: str) -> Match | None:
-        raise NotImplementedError  # TODO
+    def search(self, text: str, start: int = 0, end: int | None = None) -> Match | None:
+        # TODO implement this properly via automaton; for now, try match() at each start position in turn
+        end_ = end if end is not None else len(text)
+        for i in range(start, end_):  # NOTE(review): position end_ itself is never tried, so an empty match at the end of the text is not found
+            m = self.match(text, i, end)
+            if m is not None:  # first start position that matches wins
+                return m
+        return None
diff --git a/tests/test_nfa.py b/tests/test_nfa.py
@@ -3,13 +3,14 @@
 import pytest
 
 from regex_automata.automata.nfa import NFA
-from regex_automata.automata.nfa_evaluator import NFAEvaluator
+from regex_automata.regex.nfa_evaluator import NFAEvaluator
 
 DATA_DIR = PurePath(__file__).parent / "data"
 
 def _accepts(nfa: NFA, s: str) -> bool:
     evaluator = NFAEvaluator(nfa)
-    return evaluator.accepts(s)
+    m = evaluator.match(s)
+    return m is not None and m.match == s  # accepted only when the match covers the whole string
 
 
 def _test_lol_strings(nfa: NFA):
diff --git a/tests/test_regex.py b/tests/test_regex.py
@@ -82,3 +82,25 @@ def test_digits():
 
     assert not p.fullmatch("60")
     assert not p.fullmatch("01")
+
+
+def test_match():  # match() is anchored at `start` and bounded by `end`
+    p1 = regex_automata.compile(r"a{3}")
+    m = p1.match("aaa")
+    assert m is not None and m.span == (0, 3)
+    m = p1.match("baaa")  # leading "b" prevents a match at position 0
+    assert m is None
+    m = p1.match("baaa", start=1)  # start=1 skips the leading "b"
+    assert m is not None and m.span == (1, 4)
+
+    p2 = regex_automata.compile(r"a+")
+    m = p2.match("aaaaaaaaaaaaaaaaaa", start=5, end=7)  # end=7 caps the match even though more "a"s follow
+    assert m is not None and m.span == (5, 7)
+
+
+def test_search():  # search() returns the first match at or after `start`
+    p1 = regex_automata.compile(r"[a-z0-9]+@[a-z0-9]+\.[a-z0-9]+")
+    m = p1.search("text abc@def.com xyz@123.com")
+    assert m is not None and m.match == "abc@def.com"
+    m = p1.search("text abc@def.com xyz@123.com", start=10)  # start=10 lies past the first address's "@", so only the second address can match
+    assert m is not None and m.match == "xyz@123.com"