From 2f6f45bb7558d3f0c27f7ac1465f097e63880003 Mon Sep 17 00:00:00 2001
From: madara88645 <163588475+madara88645@users.noreply.github.com>
Date: Tue, 17 Mar 2026 09:30:55 +0000
Subject: [PATCH 1/3] Perf: pre-compile regexes in pick_persona heuristic

Pre-compile the PERSONA_KEYWORDS regex dictionary into a module-level
`_COMPILED_PERSONA_RX` to avoid regex parsing overhead inside the
hot loop of `pick_persona`. Use `p.pattern` in evidence to preserve
compatibility.
---
 .jules/bolt.md             |  4 ++++
 app/heuristics/__init__.py | 12 +++++++++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/.jules/bolt.md b/.jules/bolt.md
index 2e5e816..b957acd 100644
--- a/.jules/bolt.md
+++ b/.jules/bolt.md
@@ -5,3 +5,7 @@
 ## 2024-06-20 - Optimizing SQLite JSON Deserialization Cache Size
 **Learning:** In the backend RAG implementation (`app/rag/simple_index.py`), embeddings are stored as JSON strings in an SQLite database. I initially attempted to optimize repeated deserialization by adding an `lru_cache` bounded to `maxsize=1024` for parsing these strings. However, because similarity searches involve a linear scan over all database chunks, if the database has more than 1,024 chunks, the cache is completely evicted during a single scan, resulting in a 0% cache hit rate on subsequent searches (cache thrashing).
 **Action:** When caching objects that are iterated over sequentially during database table scans, ensure the cache boundary is sized large enough (e.g., `maxsize=65536`) to accommodate the entire dataset or use an unbounded cache if safe, otherwise the caching mechanism will only add overhead.
+
+## 2025-03-08 - Optimizing Heuristic Regex Searches
+**Learning:** In hot loops evaluating multiple regex patterns (e.g., categorizing user personas in `pick_persona`), iterating over raw string patterns and calling `re.search(p, text)` adds avoidable overhead on every call: the `re` module has to look each pattern string up in its internal cache of compiled patterns, and when a program uses more distinct patterns than that cache holds, evicted patterns must be parsed and compiled again.
+**Action:** When a heuristic function evaluates a large dictionary or list of constant string patterns on every invocation, pre-compile the patterns once into a module-level dictionary (`_COMPILED_PERSONA_RX`) and iterate over the pre-compiled objects (`p.search(text)`). This removes the per-call cache lookup and guarantees the patterns are never evicted and recompiled, regardless of how many other regexes the program uses. To maintain backwards compatibility when returning matched evidence, append `p.pattern` (the original pattern string) to the evidence list instead of the compiled object.
diff --git a/app/heuristics/__init__.py b/app/heuristics/__init__.py
index 4844442..fdf8623 100644
--- a/app/heuristics/__init__.py
+++ b/app/heuristics/__init__.py
@@ -195,15 +195,21 @@
 }
 
 
+_COMPILED_PERSONA_RX = {
+    persona: [re.compile(p) for p in pats]
+    for persona, pats in PERSONA_KEYWORDS.items()
+}
+
+
 def pick_persona(text: str) -> tuple[str, dict]:
     lower = text.lower()
     scores = {k: 0 for k in PERSONA_KEYWORDS}
     evidence: dict[str, list[str]] = {k: [] for k in PERSONA_KEYWORDS}
-    for persona, pats in PERSONA_KEYWORDS.items():
+    for persona, pats in _COMPILED_PERSONA_RX.items():
         for p in pats:
-            if re.search(p, lower):
+            if p.search(lower):
                 scores[persona] += 1
-                evidence[persona].append(p)
+                evidence[persona].append(p.pattern)
     # choose highest score, tie -> deterministic alphabetical order of persona key
     ranked = sorted(scores.items(), key=lambda x: (-x[1], x[0]))
     if ranked and ranked[0][1] > 0:

From 1ccd1b002a060dcc65743ea5c12ee9fa22bf39ac Mon Sep 17 00:00:00 2001
From: madara88645 <163588475+madara88645@users.noreply.github.com>
Date: Tue, 17 Mar 2026 09:34:49 +0000
Subject: [PATCH 2/3] Perf: pre-compile regexes in pick_persona heuristic

Pre-compile the PERSONA_KEYWORDS regex dictionary into a module-level
`_COMPILED_PERSONA_RX` to avoid regex parsing overhead inside the
hot loop of `pick_persona`. Use `p.pattern` in evidence to preserve
compatibility. Also fixed formatting to pass CI.
---
 app/heuristics/__init__.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/app/heuristics/__init__.py b/app/heuristics/__init__.py
index fdf8623..a39e76e 100644
--- a/app/heuristics/__init__.py
+++ b/app/heuristics/__init__.py
@@ -196,8 +196,7 @@
 
 
 _COMPILED_PERSONA_RX = {
-    persona: [re.compile(p) for p in pats]
-    for persona, pats in PERSONA_KEYWORDS.items()
+    persona: [re.compile(p) for p in pats] for persona, pats in PERSONA_KEYWORDS.items()
 }
 
 
@@ -870,9 +869,9 @@ def extract_inputs(text: str, lang: str) -> Dict[str, str]:
             unit = cur3 or cur1 or cur2 or ""
             if cur1 or cur2 or cur3:
                 if v2:
-                    inputs[
-                        "budget_hint"
-                    ] = f"{_normalize_currency(v1)}-{_normalize_currency(v2)} {unit}".strip()
+                    inputs["budget_hint"] = (
+                        f"{_normalize_currency(v1)}-{_normalize_currency(v2)} {unit}".strip()
+                    )
                 else:
                     inputs["budget_hint"] = f"{_normalize_currency(v1)} {unit}".strip()
 

From abbda55ca87618875ac73251751b6e339aa7fcee Mon Sep 17 00:00:00 2001
From: madara88645 <163588475+madara88645@users.noreply.github.com>
Date: Tue, 17 Mar 2026 09:37:42 +0000
Subject: [PATCH 3/3] Perf: pre-compile regexes in pick_persona heuristic

Pre-compile the PERSONA_KEYWORDS regex dictionary into a module-level
`_COMPILED_PERSONA_RX` to avoid regex parsing overhead inside the
hot loop of `pick_persona`. Use `p.pattern` in evidence to preserve
compatibility.

Ran `pre-commit run --all-files` to apply correct ruff-format
linting rules before submission.
---
 app/heuristics/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/app/heuristics/__init__.py b/app/heuristics/__init__.py
index a39e76e..000e134 100644
--- a/app/heuristics/__init__.py
+++ b/app/heuristics/__init__.py
@@ -869,9 +869,9 @@ def extract_inputs(text: str, lang: str) -> Dict[str, str]:
             unit = cur3 or cur1 or cur2 or ""
             if cur1 or cur2 or cur3:
                 if v2:
-                    inputs["budget_hint"] = (
-                        f"{_normalize_currency(v1)}-{_normalize_currency(v2)} {unit}".strip()
-                    )
+                    inputs[
+                        "budget_hint"
+                    ] = f"{_normalize_currency(v1)}-{_normalize_currency(v2)} {unit}".strip()
                 else:
                     inputs["budget_hint"] = f"{_normalize_currency(v1)} {unit}".strip()