Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make extract_top_level_blocks() faster #189

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changes/unreleased/Under the Hood-20240910-153447.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
kind: Under the Hood
body: Improve docs parsing performance.
time: 2024-09-10T15:34:47.389953+02:00
custom:
Author: fredriv
Issue: "9037"
44 changes: 41 additions & 3 deletions dbt_common/clients/_jinja_blocks.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import dataclasses
import re
from collections import namedtuple
from typing import Iterator, List, Optional, Set, Union
from typing import Dict, Iterator, List, Optional, Set, Union

from dbt_common.exceptions import (
BlockDefinitionNotAtTopError,
Expand Down Expand Up @@ -104,11 +105,24 @@ def end_pat(self) -> re.Pattern:
QUOTE_START_PATTERN = regex(r"""(?P<quote>(['"]))""")


@dataclasses.dataclass
class PositionedMatch:
    """Outcome of one regex search, remembered so TagIterator can reuse it.

    Pairs the offset a search was started from with whatever that search
    produced (a match object, or None when nothing was found).
    """

    # Offset in the text at which the search began.
    start_pos: int
    # Result of that search; None means the pattern was not found.
    match: Optional[re.Match]


class TagIterator:
def __init__(self, text: str) -> None:
self.text: str = text
self.pos: int = 0

# Performance enhancement: A cache of the most recent matches seen for each pattern.
# Includes the start position used for the search.
self._past_matches: Dict[re.Pattern, PositionedMatch] = {}

def linepos(self, end: Optional[int] = None) -> str:
"""Return relative position in line.

Expand All @@ -129,8 +143,32 @@ def advance(self, new_position: int) -> None:
def rewind(self, amount: int = 1) -> None:
self.pos -= amount

def _search(self, pattern: re.Pattern) -> Optional[re.Match]:
    """Return the first match of *pattern* at or after the current position.

    Results are remembered per pattern in ``self._past_matches`` together
    with the position each search started from, so that repeated calls made
    while ``self.pos`` moves forward can often avoid rescanning ``self.text``.

    Args:
        pattern: Compiled regular expression to look for.

    Returns:
        The match object, or None when *pattern* does not occur from
        ``self.pos`` onward.
    """
    cached = self._past_matches.get(pattern)

    # A cached result is only usable when its search began at or before the
    # current position; one that began further along may have skipped text
    # that now has to be examined.
    if cached is not None and cached.start_pos <= self.pos:
        if cached.match is None:
            # Nothing was found from an earlier start, so nothing lies ahead.
            return None
        if cached.match.start() >= self.pos:
            # The start of the cached match has not been passed yet, so it
            # is still the next match.
            return cached.match

    # Either there is no cache entry, it was made from a later position, or
    # the cached match is already behind us: search afresh and remember it.
    match = pattern.search(self.text, self.pos)
    self._past_matches[pattern] = PositionedMatch(self.pos, match)
    return match

def _match(self, pattern: re.Pattern) -> Optional[re.Match]:
return pattern.match(self.text, self.pos)
Expand Down
Loading