
Commit 6cecec0

perf: implement enhanced memory management utilities
- Added aggressive garbage collection and memory checks.
- Introduced token counting optimizations with caching and chunking.
- Integrated content caching and progressive cleanup for large repos.
- Adjusted memory thresholds for better handling of OOM risks.
1 parent d1d7abb commit 6cecec0


7 files changed, +305 -57 lines changed


src/gitingest/config.py

Lines changed: 3 additions & 1 deletion

@@ -11,7 +11,9 @@

 # Memory optimization settings
 BATCH_SIZE = 100  # Process files in batches to reduce memory usage
-MEMORY_CHECK_INTERVAL = 50  # Check memory usage every N files
+MEMORY_CHECK_INTERVAL = 25  # Check memory usage every N files (more frequent)
+AGGRESSIVE_GC_INTERVAL = 10  # Force garbage collection every N files for large repos
+MEMORY_PRESSURE_THRESHOLD_MB = 2000  # Trigger aggressive cleanup at 2GB usage

 OUTPUT_FILE_NAME = "digest.txt"
src/gitingest/ingestion.py

Lines changed: 16 additions & 4 deletions

@@ -5,7 +5,13 @@
 from pathlib import Path
 from typing import TYPE_CHECKING

-from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES, MEMORY_CHECK_INTERVAL
+from gitingest.config import (
+    AGGRESSIVE_GC_INTERVAL,
+    MAX_DIRECTORY_DEPTH,
+    MAX_FILES,
+    MAX_TOTAL_SIZE_BYTES,
+    MEMORY_CHECK_INTERVAL,
+)
 from gitingest.output_formatter import format_node
 from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats
 from gitingest.utils.ingestion_utils import _should_exclude, _should_include

@@ -265,14 +271,20 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat
     stats.total_files += 1
     stats.total_size += file_size

-    # Check memory usage periodically and force GC if needed
+    # More aggressive memory management for large repositories
+    if stats.total_files % AGGRESSIVE_GC_INTERVAL == 0:
+        force_garbage_collection()
+
+    # Check memory usage periodically and force more aggressive GC if needed
     if stats.total_files % MEMORY_CHECK_INTERVAL == 0 and check_memory_pressure():
         logger.warning(
-            "Memory pressure detected, forcing garbage collection",
+            "Memory pressure detected, forcing aggressive garbage collection",
             extra={"files_processed": stats.total_files},
         )
+        # Multiple GC cycles for better cleanup
+        force_garbage_collection()
         force_garbage_collection()
-        log_memory_stats(f"after processing {stats.total_files} files")
+        log_memory_stats(f"after aggressive cleanup at {stats.total_files} files")

     child = FileSystemNode(
         name=path.name,
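
For reference, the two intervals compose into a simple cadence: every 10th file gets an unconditional collection, and every 25th file additionally runs the pressure check with a doubled collection pass. A standalone sketch of that cadence (plain Python, not gitingest code; the memory_pressure flag stands in for check_memory_pressure()):

import gc

AGGRESSIVE_GC_INTERVAL = 10  # mirrors the new config constant
MEMORY_CHECK_INTERVAL = 25   # mirrors the new config constant


def after_file_processed(total_files: int, memory_pressure: bool) -> list[str]:
    """Return the cleanup actions the cadence takes at this file count."""
    actions = []
    if total_files % AGGRESSIVE_GC_INTERVAL == 0:
        gc.collect()
        actions.append("unconditional gc")
    if total_files % MEMORY_CHECK_INTERVAL == 0 and memory_pressure:
        gc.collect()
        gc.collect()  # two passes, mirroring the doubled call in the diff
        actions.append("aggressive gc")
    return actions


# Files 10, 20, 30, ... get the unconditional pass; under memory pressure,
# files 25, 50, 75, ... also get the double pass (file 50 gets both).
print(after_file_processed(50, memory_pressure=True))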

src/gitingest/output_formatter.py

Lines changed: 41 additions & 46 deletions

@@ -2,28 +2,21 @@

 from __future__ import annotations

-import ssl
 from io import StringIO
 from typing import TYPE_CHECKING

-import requests.exceptions
-import tiktoken
-
 from gitingest.schemas import FileSystemNode, FileSystemNodeType
 from gitingest.utils.compat_func import readlink
 from gitingest.utils.logging_config import get_logger
+from gitingest.utils.memory_utils import force_garbage_collection, log_memory_stats
+from gitingest.utils.token_utils import clear_encoding_cache, count_tokens_optimized, format_token_count

 if TYPE_CHECKING:
     from gitingest.schemas import IngestionQuery

 # Initialize logger for this module
 logger = get_logger(__name__)

-_TOKEN_THRESHOLDS: list[tuple[int, str]] = [
-    (1_000_000, "M"),
-    (1_000, "k"),
-]
-

 def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str, str]:
     """Generate a summary, directory structure, and file contents for a given file system node.

@@ -52,13 +45,33 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str,
     summary += f"File: {node.name}\n"
     summary += f"Lines: {len(node.content.splitlines()):,}\n"

+    # Log memory before tree generation
+    log_memory_stats("before tree structure generation")
+
     tree = "Directory structure:\n" + _create_tree_structure(query, node=node)

+    # Log memory before content gathering (this is the memory-intensive part)
+    log_memory_stats("before content gathering")
+
     content = _gather_file_contents(node)

-    token_estimate = _format_token_count(tree + content)
-    if token_estimate:
-        summary += f"\nEstimated tokens: {token_estimate}"
+    # Force garbage collection after content gathering
+    force_garbage_collection()
+    log_memory_stats("after content gathering and cleanup")
+
+    # Count tokens with optimization
+    token_count = count_tokens_optimized(tree + content)
+    if token_count > 0:
+        summary += f"\nEstimated tokens: {format_token_count(token_count)}"
+
+    # Final cleanup
+    if hasattr(node, "clear_content_cache_recursive"):
+        node.clear_content_cache_recursive()
+
+    # Clear the tiktoken encoding cache to free memory
+    clear_encoding_cache()
+    force_garbage_collection()
+    log_memory_stats("after final cache and encoding cleanup")

     return summary, tree, content

@@ -133,7 +146,12 @@ def _gather_file_contents(node: FileSystemNode) -> str:


 def _gather_file_contents_recursive(node: FileSystemNode, buffer: StringIO) -> None:
-    """Recursively gather file contents into a StringIO buffer to reduce memory usage.
+    """Recursively gather file contents with memory optimization.
+
+    This version includes memory optimizations:
+    - Progressive content cache clearing
+    - Periodic garbage collection
+    - Memory-aware processing

     Parameters
     ----------

@@ -144,12 +162,21 @@ def _gather_file_contents_recursive(node: FileSystemNode, buffer: StringIO) -> N

     """
     if node.type != FileSystemNodeType.DIRECTORY:
+        # Write content and immediately clear cache to free memory
         buffer.write(node.content_string)
+        node.clear_content_cache()
         return

-    for child in node.children:
+    for files_processed, child in enumerate(node.children, 1):
         _gather_file_contents_recursive(child, buffer)

+        # Progressive cleanup every 10 files to prevent memory accumulation
+        if files_processed % 10 == 0:
+            force_garbage_collection()
+
+    # Clear content cache for this directory after processing all children
+    node.clear_content_cache()
+

 def _create_tree_structure(
     query: IngestionQuery,

@@ -201,35 +228,3 @@ def _create_tree_structure(
     for i, child in enumerate(node.children):
         tree_str += _create_tree_structure(query, node=child, prefix=prefix, is_last=i == len(node.children) - 1)
     return tree_str
-
-
-def _format_token_count(text: str) -> str | None:
-    """Return a human-readable token-count string (e.g. 1.2k, 1.2M).
-
-    Parameters
-    ----------
-    text : str
-        The text string for which the token count is to be estimated.
-
-    Returns
-    -------
-    str | None
-        The formatted number of tokens as a string (e.g., ``"1.2k"``, ``"1.2M"``), or ``None`` if an error occurs.
-
-    """
-    try:
-        encoding = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
-        total_tokens = len(encoding.encode(text, disallowed_special=()))
-    except (ValueError, UnicodeEncodeError) as exc:
-        logger.warning("Failed to estimate token size", extra={"error": str(exc)})
-        return None
-    except (requests.exceptions.RequestException, ssl.SSLError) as exc:
-        # If network errors, skip token count estimation instead of erroring out
-        logger.warning("Failed to download tiktoken model", extra={"error": str(exc)})
-        return None
-
-    for threshold, suffix in _TOKEN_THRESHOLDS:
-        if total_tokens >= threshold:
-            return f"{total_tokens / threshold:.1f}{suffix}"
-
-    return str(total_tokens)
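
The imports above pull clear_encoding_cache, count_tokens_optimized, and format_token_count from the new gitingest.utils.token_utils module, whose diff is not shown on this page. A minimal sketch of what a cached, chunked counter consistent with those names could look like (an assumption about the module, not its actual contents):

from __future__ import annotations

import tiktoken

# Module-level cache so the encoding tables are built once, then reused.
_ENCODING: tiktoken.Encoding | None = None


def _get_encoding() -> tiktoken.Encoding:
    """Create the o200k_base encoding on first use and reuse it afterwards."""
    global _ENCODING
    if _ENCODING is None:
        _ENCODING = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
    return _ENCODING


def clear_encoding_cache() -> None:
    """Drop the cached encoding so its tables can be garbage-collected."""
    global _ENCODING
    _ENCODING = None


def count_tokens_optimized(text: str, chunk_size: int = 1_000_000) -> int:
    """Count tokens chunk by chunk instead of encoding one huge string."""
    try:
        encoding = _get_encoding()
        return sum(
            len(encoding.encode(text[i : i + chunk_size], disallowed_special=()))
            for i in range(0, len(text), chunk_size)
        )
    except (ValueError, UnicodeEncodeError):
        return 0  # format_node() skips the estimate when the count is 0


def format_token_count(count: int) -> str:
    """Format a count as 1.2k / 1.2M, like the removed _format_token_count."""
    for threshold, suffix in ((1_000_000, "M"), (1_000, "k")):
        if count >= threshold:
            return f"{count / threshold:.1f}{suffix}"
    return str(count)

Note that chunking at fixed character offsets can split a token at a chunk boundary, so the sum is an estimate rather than an exact count; that trade-off is acceptable here because format_node() only displays a rounded figure.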

src/gitingest/schemas/filesystem.py

Lines changed: 50 additions & 4 deletions

@@ -50,6 +50,7 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes
     dir_count: int = 0
     depth: int = 0
     children: list[FileSystemNode] = field(default_factory=list)
+    _content_cache: str | None = field(default=None, init=False)

     def sort_children(self) -> None:
         """Sort the children nodes of a directory according to a specific order.

@@ -106,10 +107,9 @@ def content_string(self) -> str:

     @property
     def content(self) -> str:  # pylint: disable=too-many-return-statements
-        """Return file content (if text / notebook) or an explanatory placeholder.
+        """Return file content with caching for memory optimization.

-        Heuristically decides whether the file is text or binary by decoding a small chunk of the file
-        with multiple encodings and checking for common binary markers.
+        Uses lazy loading and caching to reduce memory usage for large repositories.

         Returns
         -------

@@ -129,14 +129,50 @@ def content(self) -> str: # pylint: disable=too-many-return-statements
         if self.type == FileSystemNodeType.SYMLINK:
             return ""  # TODO: are we including the empty content of symlinks?

-        if self.path.suffix == ".ipynb":  # Notebook
+        # Return cached content if available
+        if self._content_cache is not None:
+            return self._content_cache
+
+        # Load and cache content
+        self._content_cache = self._load_content()
+        return self._content_cache
+
+    def _load_content(self) -> str:
+        """Load file content from disk.
+
+        Returns
+        -------
+        str
+            The file content
+
+        """
+        # Handle notebooks separately
+        if self.path.suffix == ".ipynb":
             try:
                 return process_notebook(self.path)
             except Exception as exc:
                 return f"Error processing notebook: {exc}"

+        # Read file chunk for analysis
         chunk = _read_chunk(self.path)

+        # Determine the appropriate content based on chunk analysis
+        return self._analyze_chunk_and_read(chunk)
+
+    def _analyze_chunk_and_read(self, chunk: bytes | None) -> str:
+        """Analyze file chunk and return appropriate content.
+
+        Parameters
+        ----------
+        chunk : bytes | None
+            The file chunk to analyze
+
+        Returns
+        -------
+        str
+            The file content or error message
+
+        """
         if chunk is None:
             return "Error reading file"

@@ -187,3 +223,13 @@ def _read_file_content_streaming(self, encoding: str, chunk_size: int = 8192) ->
             return content_buffer.getvalue()
         finally:
             content_buffer.close()
+
+    def clear_content_cache(self) -> None:
+        """Clear cached content to free memory."""
+        self._content_cache = None
+
+    def clear_content_cache_recursive(self) -> None:
+        """Recursively clear content cache for this node and all children."""
+        self.clear_content_cache()
+        for child in self.children:
+            child.clear_content_cache_recursive()
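
In use, the cache fills on first access to .content and is released explicitly by the formatter; a short sketch, assuming node and root are FileSystemNode instances produced by ingestion:

text = node.content         # first access: reads from disk, fills _content_cache
again = node.content        # second access: served from the cache, no disk I/O
node.clear_content_cache()  # formatter calls this once the text has been written out

# After a whole subtree has been formatted, release everything at once:
root.clear_content_cache_recursive()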

src/gitingest/utils/memory_utils.py

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ def force_garbage_collection() -> None:
         logger.warning("Failed to force garbage collection", extra={"error": str(exc)})


-def check_memory_pressure(threshold_mb: float = 3000) -> bool:
+def check_memory_pressure(threshold_mb: float = 2000) -> bool:
     """Check if memory usage is above threshold.

     Parameters
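
The body of check_memory_pressure() falls outside this hunk. A psutil-based implementation consistent with the signature might look like this (an assumption for illustration; the actual helper may differ):

import psutil


def check_memory_pressure(threshold_mb: float = 2000) -> bool:
    """Return True when this process's resident set size exceeds threshold_mb."""
    rss_mb = psutil.Process().memory_info().rss / (1024 * 1024)
    return rss_mb > threshold_mb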
