
Commit 6cecec0

perf: implement enhanced memory management utilities
- Added aggressive garbage collection and memory checks.
- Introduced token counting optimizations with caching and chunking.
- Integrated content caching and progressive cleanup for large repos.
- Adjusted memory thresholds for better handling of OOM risks.
1 parent d1d7abb commit 6cecec0


7 files changed, +305 -57 lines changed


src/gitingest/config.py

Lines changed: 3 additions & 1 deletion

@@ -11,7 +11,9 @@

 # Memory optimization settings
 BATCH_SIZE = 100  # Process files in batches to reduce memory usage
-MEMORY_CHECK_INTERVAL = 50  # Check memory usage every N files
+MEMORY_CHECK_INTERVAL = 25  # Check memory usage every N files (more frequent)
+AGGRESSIVE_GC_INTERVAL = 10  # Force garbage collection every N files for large repos
+MEMORY_PRESSURE_THRESHOLD_MB = 2000  # Trigger aggressive cleanup at 2GB usage

 OUTPUT_FILE_NAME = "digest.txt"
src/gitingest/ingestion.py

Lines changed: 16 additions & 4 deletions

@@ -5,7 +5,13 @@
 from pathlib import Path
 from typing import TYPE_CHECKING

-from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES, MEMORY_CHECK_INTERVAL
+from gitingest.config import (
+    AGGRESSIVE_GC_INTERVAL,
+    MAX_DIRECTORY_DEPTH,
+    MAX_FILES,
+    MAX_TOTAL_SIZE_BYTES,
+    MEMORY_CHECK_INTERVAL,
+)
 from gitingest.output_formatter import format_node
 from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats
 from gitingest.utils.ingestion_utils import _should_exclude, _should_include

@@ -265,14 +271,20 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat
     stats.total_files += 1
     stats.total_size += file_size

-    # Check memory usage periodically and force GC if needed
+    # More aggressive memory management for large repositories
+    if stats.total_files % AGGRESSIVE_GC_INTERVAL == 0:
+        force_garbage_collection()
+
+    # Check memory usage periodically and force more aggressive GC if needed
     if stats.total_files % MEMORY_CHECK_INTERVAL == 0 and check_memory_pressure():
         logger.warning(
-            "Memory pressure detected, forcing garbage collection",
+            "Memory pressure detected, forcing aggressive garbage collection",
             extra={"files_processed": stats.total_files},
         )
+        # Multiple GC cycles for better cleanup
+        force_garbage_collection()
         force_garbage_collection()
-        log_memory_stats(f"after processing {stats.total_files} files")
+        log_memory_stats(f"after aggressive cleanup at {stats.total_files} files")

     child = FileSystemNode(
         name=path.name,
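
For reference, the two intervals compose into a simple cadence: every 10th file gets an unconditional collection, and every 25th file additionally runs the pressure check with a doubled collection pass. A standalone sketch of that cadence (plain Python, not gitingest code; the memory_pressure flag stands in for check_memory_pressure()):

import gc

AGGRESSIVE_GC_INTERVAL = 10  # mirrors the new config constant
MEMORY_CHECK_INTERVAL = 25   # mirrors the new config constant


def after_file_processed(total_files: int, memory_pressure: bool) -> list[str]:
    """Return the cleanup actions the cadence takes at this file count."""
    actions = []
    if total_files % AGGRESSIVE_GC_INTERVAL == 0:
        gc.collect()
        actions.append("unconditional gc")
    if total_files % MEMORY_CHECK_INTERVAL == 0 and memory_pressure:
        gc.collect()
        gc.collect()  # two passes, mirroring the doubled call in the diff
        actions.append("aggressive gc")
    return actions


# Files 10, 20, 30, ... get the unconditional pass; under memory pressure,
# files 25, 50, 75, ... also get the double pass (file 50 gets both).
print(after_file_processed(50, memory_pressure=True))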

src/gitingest/output_formatter.py

Lines changed: 41 additions & 46 deletions

@@ -2,28 +2,21 @@

 from __future__ import annotations

-import ssl
 from io import StringIO
 from typing import TYPE_CHECKING

-import requests.exceptions
-import tiktoken
-
 from gitingest.schemas import FileSystemNode, FileSystemNodeType
 from gitingest.utils.compat_func import readlink
 from gitingest.utils.logging_config import get_logger
+from gitingest.utils.memory_utils import force_garbage_collection, log_memory_stats
+from gitingest.utils.token_utils import clear_encoding_cache, count_tokens_optimized, format_token_count

 if TYPE_CHECKING:
     from gitingest.schemas import IngestionQuery

 # Initialize logger for this module
 logger = get_logger(__name__)

-_TOKEN_THRESHOLDS: list[tuple[int, str]] = [
-    (1_000_000, "M"),
-    (1_000, "k"),
-]
-

 def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str, str]:
     """Generate a summary, directory structure, and file contents for a given file system node.

@@ -52,13 +45,33 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str,
     summary += f"File: {node.name}\n"
     summary += f"Lines: {len(node.content.splitlines()):,}\n"

+    # Log memory before tree generation
+    log_memory_stats("before tree structure generation")
+
     tree = "Directory structure:\n" + _create_tree_structure(query, node=node)

+    # Log memory before content gathering (this is the memory-intensive part)
+    log_memory_stats("before content gathering")
+
     content = _gather_file_contents(node)

-    token_estimate = _format_token_count(tree + content)
-    if token_estimate:
-        summary += f"\nEstimated tokens: {token_estimate}"
+    # Force garbage collection after content gathering
+    force_garbage_collection()
+    log_memory_stats("after content gathering and cleanup")
+
+    # Count tokens with optimization
+    token_count = count_tokens_optimized(tree + content)
+    if token_count > 0:
+        summary += f"\nEstimated tokens: {format_token_count(token_count)}"
+
+    # Final cleanup
+    if hasattr(node, "clear_content_cache_recursive"):
+        node.clear_content_cache_recursive()
+
+    # Clear the tiktoken encoding cache to free memory
+    clear_encoding_cache()
+    force_garbage_collection()
+    log_memory_stats("after final cache and encoding cleanup")

     return summary, tree, content

@@ -133,7 +146,12 @@ def _gather_file_contents(node: FileSystemNode) -> str:


 def _gather_file_contents_recursive(node: FileSystemNode, buffer: StringIO) -> None:
-    """Recursively gather file contents into a StringIO buffer to reduce memory usage.
+    """Recursively gather file contents with memory optimization.
+
+    This version includes memory optimizations:
+    - Progressive content cache clearing
+    - Periodic garbage collection
+    - Memory-aware processing

     Parameters
     ----------

@@ -144,12 +162,21 @@ def _gather_file_contents_recursive(node: FileSystemNode, buffer: StringIO) -> N

     """
     if node.type != FileSystemNodeType.DIRECTORY:
+        # Write content and immediately clear cache to free memory
         buffer.write(node.content_string)
+        node.clear_content_cache()
         return

-    for child in node.children:
+    for files_processed, child in enumerate(node.children, 1):
         _gather_file_contents_recursive(child, buffer)

+        # Progressive cleanup every 10 files to prevent memory accumulation
+        if files_processed % 10 == 0:
+            force_garbage_collection()
+
+    # Clear content cache for this directory after processing all children
+    node.clear_content_cache()
+

 def _create_tree_structure(
     query: IngestionQuery,

@@ -201,35 +228,3 @@ def _create_tree_structure(
     for i, child in enumerate(node.children):
         tree_str += _create_tree_structure(query, node=child, prefix=prefix, is_last=i == len(node.children) - 1)
     return tree_str
-
-
-def _format_token_count(text: str) -> str | None:
-    """Return a human-readable token-count string (e.g. 1.2k, 1.2M).
-
-    Parameters
-    ----------
-    text : str
-        The text string for which the token count is to be estimated.
-
-    Returns
-    -------
-    str | None
-        The formatted number of tokens as a string (e.g., ``"1.2k"``, ``"1.2M"``), or ``None`` if an error occurs.
-
-    """
-    try:
-        encoding = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
-        total_tokens = len(encoding.encode(text, disallowed_special=()))
-    except (ValueError, UnicodeEncodeError) as exc:
-        logger.warning("Failed to estimate token size", extra={"error": str(exc)})
-        return None
-    except (requests.exceptions.RequestException, ssl.SSLError) as exc:
-        # If network errors, skip token count estimation instead of erroring out
-        logger.warning("Failed to download tiktoken model", extra={"error": str(exc)})
-        return None
-
-    for threshold, suffix in _TOKEN_THRESHOLDS:
-        if total_tokens >= threshold:
-            return f"{total_tokens / threshold:.1f}{suffix}"
-
-    return str(total_tokens)
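
The imports above pull clear_encoding_cache, count_tokens_optimized, and format_token_count from the new gitingest.utils.token_utils module, whose diff is not shown on this page. A minimal sketch of what a cached, chunked counter consistent with those names could look like (an assumption about the module, not its actual contents):

from __future__ import annotations

import tiktoken

# Module-level cache so the encoding tables are built once, then reused.
_ENCODING: tiktoken.Encoding | None = None


def _get_encoding() -> tiktoken.Encoding:
    """Create the o200k_base encoding on first use and reuse it afterwards."""
    global _ENCODING
    if _ENCODING is None:
        _ENCODING = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
    return _ENCODING


def clear_encoding_cache() -> None:
    """Drop the cached encoding so its tables can be garbage-collected."""
    global _ENCODING
    _ENCODING = None


def count_tokens_optimized(text: str, chunk_size: int = 1_000_000) -> int:
    """Count tokens chunk by chunk instead of encoding one huge string."""
    try:
        encoding = _get_encoding()
        return sum(
            len(encoding.encode(text[i : i + chunk_size], disallowed_special=()))
            for i in range(0, len(text), chunk_size)
        )
    except (ValueError, UnicodeEncodeError):
        return 0  # format_node() skips the estimate when the count is 0


def format_token_count(count: int) -> str:
    """Format a count as 1.2k / 1.2M, like the removed _format_token_count."""
    for threshold, suffix in ((1_000_000, "M"), (1_000, "k")):
        if count >= threshold:
            return f"{count / threshold:.1f}{suffix}"
    return str(count)

Note that chunking at fixed character offsets can split a token at a chunk boundary, so the sum is an estimate rather than an exact count; that trade-off is acceptable here because format_node() only displays a rounded figure.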

src/gitingest/schemas/filesystem.py

Lines changed: 50 additions & 4 deletions

@@ -50,6 +50,7 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes
     dir_count: int = 0
     depth: int = 0
     children: list[FileSystemNode] = field(default_factory=list)
+    _content_cache: str | None = field(default=None, init=False)

     def sort_children(self) -> None:
         """Sort the children nodes of a directory according to a specific order.

@@ -106,10 +107,9 @@ def content_string(self) -> str:

     @property
     def content(self) -> str:  # pylint: disable=too-many-return-statements
-        """Return file content (if text / notebook) or an explanatory placeholder.
+        """Return file content with caching for memory optimization.

-        Heuristically decides whether the file is text or binary by decoding a small chunk of the file
-        with multiple encodings and checking for common binary markers.
+        Uses lazy loading and caching to reduce memory usage for large repositories.

         Returns
         -------

@@ -129,14 +129,50 @@ def content(self) -> str: # pylint: disable=too-many-return-statements
         if self.type == FileSystemNodeType.SYMLINK:
             return ""  # TODO: are we including the empty content of symlinks?

-        if self.path.suffix == ".ipynb":  # Notebook
+        # Return cached content if available
+        if self._content_cache is not None:
+            return self._content_cache
+
+        # Load and cache content
+        self._content_cache = self._load_content()
+        return self._content_cache
+
+    def _load_content(self) -> str:
+        """Load file content from disk.
+
+        Returns
+        -------
+        str
+            The file content
+
+        """
+        # Handle notebooks separately
+        if self.path.suffix == ".ipynb":
             try:
                 return process_notebook(self.path)
             except Exception as exc:
                 return f"Error processing notebook: {exc}"

+        # Read file chunk for analysis
         chunk = _read_chunk(self.path)

+        # Determine the appropriate content based on chunk analysis
+        return self._analyze_chunk_and_read(chunk)
+
+    def _analyze_chunk_and_read(self, chunk: bytes | None) -> str:
+        """Analyze file chunk and return appropriate content.
+
+        Parameters
+        ----------
+        chunk : bytes | None
+            The file chunk to analyze
+
+        Returns
+        -------
+        str
+            The file content or error message
+
+        """
         if chunk is None:
             return "Error reading file"

@@ -187,3 +223,13 @@ def _read_file_content_streaming(self, encoding: str, chunk_size: int = 8192) ->
             return content_buffer.getvalue()
         finally:
             content_buffer.close()
+
+    def clear_content_cache(self) -> None:
+        """Clear cached content to free memory."""
+        self._content_cache = None
+
+    def clear_content_cache_recursive(self) -> None:
+        """Recursively clear content cache for this node and all children."""
+        self.clear_content_cache()
+        for child in self.children:
+            child.clear_content_cache_recursive()
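
In use, the cache fills on first access to .content and is released explicitly by the formatter; a short sketch, assuming node and root are FileSystemNode instances produced by ingestion:

text = node.content         # first access: reads from disk, fills _content_cache
again = node.content        # second access: served from the cache, no disk I/O
node.clear_content_cache()  # formatter calls this once the text has been written out

# After a whole subtree has been formatted, release everything at once:
root.clear_content_cache_recursive()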

src/gitingest/utils/memory_utils.py

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ def force_garbage_collection() -> None:
         logger.warning("Failed to force garbage collection", extra={"error": str(exc)})


-def check_memory_pressure(threshold_mb: float = 3000) -> bool:
+def check_memory_pressure(threshold_mb: float = 2000) -> bool:
     """Check if memory usage is above threshold.

     Parameters
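
The body of check_memory_pressure() falls outside this hunk. A psutil-based implementation consistent with the signature might look like this (an assumption for illustration; the actual helper may differ):

import psutil


def check_memory_pressure(threshold_mb: float = 2000) -> bool:
    """Return True when this process's resident set size exceeds threshold_mb."""
    rss_mb = psutil.Process().memory_info().rss / (1024 * 1024)
    return rss_mb > threshold_mb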
