From 2f6f45bb7558d3f0c27f7ac1465f097e63880003 Mon Sep 17 00:00:00 2001 From: madara88645 <163588475+madara88645@users.noreply.github.com> Date: Tue, 17 Mar 2026 09:30:55 +0000 Subject: [PATCH 1/3] Perf: pre-compile regexes in pick_persona heuristic Pre-compile the PERSONA_KEYWORDS regex dictionary into a module-level `_COMPILED_PERSONA_RX` to avoid regex parsing overhead inside the hot loop of `pick_persona`. Use `p.pattern` in evidence to preserve compatibility. --- .jules/bolt.md | 4 ++++ app/heuristics/__init__.py | 12 +++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/.jules/bolt.md b/.jules/bolt.md index 2e5e816..b957acd 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -5,3 +5,7 @@ ## 2024-06-20 - Optimizing SQLite JSON Deserialization Cache Size **Learning:** In the backend RAG implementation (`app/rag/simple_index.py`), embeddings are stored as JSON strings in an SQLite database. I initially attempted to optimize repeated deserialization by adding an `lru_cache` bounded to `maxsize=1024` for parsing these strings. However, because similarity searches involve a linear scan over all database chunks, if the database has more than 1,024 chunks, the cache is completely evicted during a single scan, resulting in a 0% cache hit rate on subsequent searches (cache thrashing). **Action:** When caching objects that are iterated over sequentially during database table scans, ensure the cache boundary is sized large enough (e.g., `maxsize=65536`) to accommodate the entire dataset or use an unbounded cache if safe, otherwise the caching mechanism will only add overhead. 
+ +## 2025-03-08 - Optimizing Heuristic Regex Searches +**Learning:** In hot loops evaluating multiple regex patterns (e.g., categorizing user personas in `pick_persona`), iterating over raw string patterns and calling `re.search(p, text)` incurs avoidable overhead: every call performs a lookup in the `re` module's internal compiled-pattern cache, and when more distinct patterns are in use than that bounded cache can hold, evicted patterns must be parsed and compiled again. +**Action:** When a heuristic function evaluates a large dictionary or list of constant string patterns on every invocation, pre-compile the patterns into a module-level dictionary (`_COMPILED_PERSONA_RX`) and iterate over the pre-compiled objects (`p.search(text)`). This removes the per-call cache lookup and protects these patterns from cache eviction and recompilation. To maintain backwards compatibility when returning matched evidence, append `p.pattern` to the evidence list instead of the compiled object. diff --git a/app/heuristics/__init__.py b/app/heuristics/__init__.py index 4844442..fdf8623 100644 --- a/app/heuristics/__init__.py +++ b/app/heuristics/__init__.py @@ -195,15 +195,21 @@ } +_COMPILED_PERSONA_RX = { + persona: [re.compile(p) for p in pats] + for persona, pats in PERSONA_KEYWORDS.items() +} + + def pick_persona(text: str) -> tuple[str, dict]: lower = text.lower() scores = {k: 0 for k in PERSONA_KEYWORDS} evidence: dict[str, list[str]] = {k: [] for k in PERSONA_KEYWORDS} - for persona, pats in PERSONA_KEYWORDS.items(): + for persona, pats in _COMPILED_PERSONA_RX.items(): for p in pats: - if re.search(p, lower): + if p.search(lower): scores[persona] += 1 - evidence[persona].append(p) + evidence[persona].append(p.pattern) # choose highest score, tie -> deterministic alphabetical order of persona key ranked = sorted(scores.items(), key=lambda x: (-x[1], x[0])) if ranked and ranked[0][1] > 0: From 1ccd1b002a060dcc65743ea5c12ee9fa22bf39ac Mon Sep 17 00:00:00 2001 From: madara88645 <163588475+madara88645@users.noreply.github.com> Date: Tue, 17 Mar 2026 09:34:49 
+0000 Subject: [PATCH 2/3] Perf: pre-compile regexes in pick_persona heuristic Pre-compile the PERSONA_KEYWORDS regex dictionary into a module-level `_COMPILED_PERSONA_RX` to avoid regex parsing overhead inside the hot loop of `pick_persona`. Use `p.pattern` in evidence to preserve compatibility. Also fixed formatting to pass CI. --- app/heuristics/__init__.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/app/heuristics/__init__.py b/app/heuristics/__init__.py index fdf8623..a39e76e 100644 --- a/app/heuristics/__init__.py +++ b/app/heuristics/__init__.py @@ -196,8 +196,7 @@ _COMPILED_PERSONA_RX = { - persona: [re.compile(p) for p in pats] - for persona, pats in PERSONA_KEYWORDS.items() + persona: [re.compile(p) for p in pats] for persona, pats in PERSONA_KEYWORDS.items() } @@ -870,9 +869,9 @@ def extract_inputs(text: str, lang: str) -> Dict[str, str]: unit = cur3 or cur1 or cur2 or "" if cur1 or cur2 or cur3: if v2: - inputs[ - "budget_hint" - ] = f"{_normalize_currency(v1)}-{_normalize_currency(v2)} {unit}".strip() + inputs["budget_hint"] = ( + f"{_normalize_currency(v1)}-{_normalize_currency(v2)} {unit}".strip() + ) else: inputs["budget_hint"] = f"{_normalize_currency(v1)} {unit}".strip() From abbda55ca87618875ac73251751b6e339aa7fcee Mon Sep 17 00:00:00 2001 From: madara88645 <163588475+madara88645@users.noreply.github.com> Date: Tue, 17 Mar 2026 09:37:42 +0000 Subject: [PATCH 3/3] Perf: pre-compile regexes in pick_persona heuristic Pre-compile the PERSONA_KEYWORDS regex dictionary into a module-level `_COMPILED_PERSONA_RX` to avoid regex parsing overhead inside the hot loop of `pick_persona`. Use `p.pattern` in evidence to preserve compatibility. Ran `pre-commit run --all-files` to apply correct ruff-format linting rules before submission. 
--- app/heuristics/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app/heuristics/__init__.py b/app/heuristics/__init__.py index a39e76e..000e134 100644 --- a/app/heuristics/__init__.py +++ b/app/heuristics/__init__.py @@ -869,9 +869,9 @@ def extract_inputs(text: str, lang: str) -> Dict[str, str]: unit = cur3 or cur1 or cur2 or "" if cur1 or cur2 or cur3: if v2: - inputs["budget_hint"] = ( - f"{_normalize_currency(v1)}-{_normalize_currency(v2)} {unit}".strip() - ) + inputs[ + "budget_hint" + ] = f"{_normalize_currency(v1)}-{_normalize_currency(v2)} {unit}".strip() else: inputs["budget_hint"] = f"{_normalize_currency(v1)} {unit}".strip()