From 0c973df5fee9586663a72ccedae2cc9610802d0e Mon Sep 17 00:00:00 2001 From: madara88645 <163588475+madara88645@users.noreply.github.com> Date: Wed, 18 Mar 2026 09:27:43 +0000 Subject: [PATCH 1/2] Optimize regex and string matching in psycholinguist.py Replaced `re.search` with `in` for substring matches and pre-compiled regexes for cultural, sentiment, formality, and ambiguity heuristics to speed up hot loops. Preserved original scoring functionality. --- .jules/bolt.md | 4 ++ app/heuristics/handlers/psycholinguist.py | 82 +++++++++++++++-------- 2 files changed, 57 insertions(+), 29 deletions(-) diff --git a/.jules/bolt.md b/.jules/bolt.md index 38f48bad..03ad84f9 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -9,3 +9,7 @@ ## 2024-07-26 - Fast Vector Dot Products in Python **Learning:** For vector dot products in Python (without numpy), using `sum(map(operator.mul, vec_a, vec_b))` is approximately 30% to 40% faster than list comprehensions inside `sum([a * b for a, b in zip(vec_a, vec_b)])`. This is because it avoids the overhead of allocating an intermediate list in memory and pushes both iteration and multiplication to optimized C-level implementations. **Action:** When calculating similarity scores or dot products on vectors represented as Python lists, always prefer `map(operator.mul, a, b)` wrapped in `sum()` over list comprehensions or generator expressions. + +## 2024-08-14 - Optimizing Multiple Regex Pattern Matching Logic +**Learning:** When optimizing Python loops that count distinct regex pattern matches (e.g., `sum(1 for p in PATTERNS if re.search(p, text))`), joining all patterns into a single compiled regex (`re.compile('a|b').findall(text)`) introduces a functional regression because it counts the *total occurrences* of any pattern, not the *number of distinct patterns* matched. +**Action:** To safely optimize this logic while preserving exact functionality, pre-compile a list of distinct regular expressions at the module level and iterate through them: `sum(1 for r in COMPILED_REGEXES if r.search(text))`. Additionally, replace `re.search` with the native `in` operator (`p in text`) for exact string literals that don't rely on regex word boundaries, as it is significantly faster. diff --git a/app/heuristics/handlers/psycholinguist.py b/app/heuristics/handlers/psycholinguist.py index 7f853bd0..7f9200d6 100644 --- a/app/heuristics/handlers/psycholinguist.py +++ b/app/heuristics/handlers/psycholinguist.py @@ -143,23 +143,23 @@ class CognitiveLoadResult: # Cultural/Regional patterns UK_SPELLING = [ - r"colour", - r"flavour", - r"centre", - r"metre", - r"organise", - r"realise", - r"defence", + "colour", + "flavour", + "centre", + "metre", + "organise", + "realise", + "defence", ] US_SPELLING = [ - r"color", - r"flavor", - r"center", - r"meter", - r"organize", - r"realize", - r"defense", + "color", + "flavor", + "center", + "meter", + "organize", + "realize", + "defense", ] CURRENCY_PATTERNS = { @@ -170,13 +170,20 @@ class CognitiveLoadResult: } +# Pre-compiled regexes for performance +_CURRENCY_REGEXES = { + region: re.compile("|".join(patterns)) for region, patterns in CURRENCY_PATTERNS.items() +} + + def detect_cultural_context(text: str) -> Optional[str]: """Detect cultural context based on spelling and currency.""" text_lower = text.lower() # Check spelling - uk_score = sum(1 for p in UK_SPELLING if re.search(p, text_lower)) - us_score = sum(1 for p in US_SPELLING if re.search(p, text_lower)) + # Bolt Optimization: direct string `in` check is faster than `re.search` without word boundaries + uk_score = sum(1 for p in UK_SPELLING if p in text_lower) + us_score = sum(1 for p in US_SPELLING if p in text_lower) if uk_score > us_score: return "British" @@ -184,8 +191,9 @@ def detect_cultural_context(text: str) -> Optional[str]: return "American" # Check currency - for region, patterns in CURRENCY_PATTERNS.items(): - if any(re.search(p, text_lower) for p in patterns): + for region, regex in _CURRENCY_REGEXES.items(): + # Bolt Optimization: compiled regex `search` on joined patterns avoids overhead of multiple `re.search` calls + if regex.search(text_lower): if region == "TR": return "Turkish" if region == "US": @@ -198,34 +206,43 @@ def detect_cultural_context(text: str) -> Optional[str]: return None +_FRUSTRATION_REGEX = re.compile("|".join(FRUSTRATION_PATTERNS), re.IGNORECASE) +_CASUAL_REGEX = re.compile("|".join(CASUAL_PATTERNS)) + + def detect_sentiment(text: str) -> UserSentiment: """Analyze text to detect user sentiment.""" text_lower = text.lower() # Check for urgency + # Bolt Optimization: use pre-compiled regex and `in` for faster matching for kw in URGENT_KEYWORDS: if kw in text_lower: return UserSentiment.URGENT # Check for frustration (patterns on original text for CAPS detection) - for pattern in FRUSTRATION_PATTERNS: - if re.search(pattern, text, re.IGNORECASE): - return UserSentiment.FRUSTRATED + if _FRUSTRATION_REGEX.search(text): + return UserSentiment.FRUSTRATED # Check for casual tone - for pattern in CASUAL_PATTERNS: - if re.search(pattern, text_lower): - return UserSentiment.CASUAL + if _CASUAL_REGEX.search(text_lower): + return UserSentiment.CASUAL return UserSentiment.NEUTRAL +_TR_FORMAL_REGEXES = [re.compile(p) for p in TR_FORMAL_PATTERNS] +_TR_INFORMAL_REGEXES = [re.compile(p) for p in TR_INFORMAL_PATTERNS] + + def detect_formality(text: str) -> FormalityLevel: """Detect Turkish formality level (Siz vs Sen).""" text_lower = text.lower() - formal_score = sum(1 for p in TR_FORMAL_PATTERNS if re.search(p, text_lower)) - informal_score = sum(1 for p in TR_INFORMAL_PATTERNS if re.search(p, text_lower)) + # Bolt Optimization: pre-compiled regexes avoid looping `re.compile` + # Kept as separate regexes to match original logic of counting distinct matched patterns + formal_score = sum(1 for r in _TR_FORMAL_REGEXES if r.search(text_lower)) + informal_score = sum(1 for r in _TR_INFORMAL_REGEXES if r.search(text_lower)) if formal_score > informal_score: return FormalityLevel.FORMAL @@ -297,16 +314,23 @@ class AmbiguityResult: suggestions: list[str] = field(default_factory=list) +_AMBIGUOUS_REGEXES = { + key: (re.compile(rule["pattern"]), rule["suggestion"]) + for key, rule in AMBIGUOUS_PATTERNS.items() +} + + def detect_ambiguity(text: str) -> AmbiguityResult: """Detect vague or ambiguous terms in the prompt.""" text_lower = text.lower() result = AmbiguityResult() - for key, rule in AMBIGUOUS_PATTERNS.items(): - if re.search(rule["pattern"], text_lower): + # Bolt Optimization: avoid re.search compiling the pattern each time + for key, (regex, suggestion) in _AMBIGUOUS_REGEXES.items(): + if regex.search(text_lower): result.is_ambiguous = True result.ambiguous_terms.append(key) - result.suggestions.append(rule["suggestion"]) + result.suggestions.append(suggestion) return result From 575888ece9fe48681db19940f990c7e0333f067a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mehmet=20=C3=96zel?= <163588475+madara88645@users.noreply.github.com> Date: Wed, 18 Mar 2026 20:45:13 +0000 Subject: [PATCH 2/2] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- app/heuristics/handlers/psycholinguist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/heuristics/handlers/psycholinguist.py b/app/heuristics/handlers/psycholinguist.py index 7f9200d6..d0113131 100644 --- a/app/heuristics/handlers/psycholinguist.py +++ b/app/heuristics/handlers/psycholinguist.py @@ -220,7 +220,7 @@ def detect_sentiment(text: str) -> UserSentiment: if kw in text_lower: return UserSentiment.URGENT - # Check for frustration (patterns on original text for CAPS detection) + # Check for frustration using pre-compiled regex if _FRUSTRATION_REGEX.search(text): return UserSentiment.FRUSTRATED