
from __future__ import annotations

-import ssl
from io import StringIO
from typing import TYPE_CHECKING

-import requests.exceptions
-import tiktoken
-
from gitingest.schemas import FileSystemNode, FileSystemNodeType
from gitingest.utils.compat_func import readlink
from gitingest.utils.logging_config import get_logger
+from gitingest.utils.memory_utils import force_garbage_collection, log_memory_stats
+from gitingest.utils.token_utils import clear_encoding_cache, count_tokens_optimized, format_token_count

if TYPE_CHECKING:
    from gitingest.schemas import IngestionQuery

# Initialize logger for this module
logger = get_logger(__name__)

-_TOKEN_THRESHOLDS: list[tuple[int, str]] = [
-    (1_000_000, "M"),
-    (1_000, "k"),
-]
-

def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str, str]:
    """Generate a summary, directory structure, and file contents for a given file system node.
@@ -52,13 +45,33 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> tuple[str, str,
        summary += f"File: {node.name}\n"
        summary += f"Lines: {len(node.content.splitlines()):,}\n"

+    # Log memory before tree generation
+    log_memory_stats("before tree structure generation")
+
    tree = "Directory structure:\n" + _create_tree_structure(query, node=node)

+    # Log memory before content gathering (this is the memory-intensive part)
+    log_memory_stats("before content gathering")
+
    content = _gather_file_contents(node)

-    token_estimate = _format_token_count(tree + content)
-    if token_estimate:
-        summary += f"\nEstimated tokens: {token_estimate}"
+    # Force garbage collection after content gathering
+    force_garbage_collection()
+    log_memory_stats("after content gathering and cleanup")
+
+    # Count tokens with optimization
+    token_count = count_tokens_optimized(tree + content)
+    if token_count > 0:
+        summary += f"\nEstimated tokens: {format_token_count(token_count)}"
+
+    # Final cleanup
+    if hasattr(node, "clear_content_cache_recursive"):
+        node.clear_content_cache_recursive()
+
+    # Clear the tiktoken encoding cache to free memory
+    clear_encoding_cache()
+    force_garbage_collection()
+    log_memory_stats("after final cache and encoding cleanup")

    return summary, tree, content

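For context: `count_tokens_optimized`, `format_token_count`, and `clear_encoding_cache` are imported from the new `gitingest.utils.token_utils` module, which this commit does not include. A minimal sketch of what it might look like, assuming the optimization is simply caching the tiktoken encoding between calls (the names come from the diff; the bodies are guesses):

```python
# Hypothetical sketch of gitingest/utils/token_utils.py (not part of this diff).
# Assumes the "optimization" is caching the tiktoken encoding across calls.
from __future__ import annotations

import tiktoken

_encoding: tiktoken.Encoding | None = None


def _get_encoding() -> tiktoken.Encoding:
    """Load the o200k_base encoding (gpt-4o, gpt-4o-mini) once and cache it."""
    global _encoding
    if _encoding is None:
        _encoding = tiktoken.get_encoding("o200k_base")
    return _encoding


def count_tokens_optimized(text: str) -> int:
    """Count tokens in ``text``, returning 0 on failure instead of raising."""
    try:
        return len(_get_encoding().encode(text, disallowed_special=()))
    except (ValueError, UnicodeEncodeError):
        return 0


def clear_encoding_cache() -> None:
    """Drop the cached encoding so its memory can be reclaimed."""
    global _encoding
    _encoding = None
```

Caching would matter because `tiktoken.get_encoding` builds (and may download) the BPE ranks on first use; an explicit `clear_encoding_cache` then lets `format_node` release that memory once counting is done.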
@@ -133,7 +146,12 @@ def _gather_file_contents(node: FileSystemNode) -> str:


def _gather_file_contents_recursive(node: FileSystemNode, buffer: StringIO) -> None:
-    """Recursively gather file contents into a StringIO buffer to reduce memory usage.
+    """Recursively gather file contents with memory optimization.
+
+    This version includes memory optimizations:
+    - Progressive content cache clearing
+    - Periodic garbage collection
+    - Memory-aware processing

    Parameters
    ----------
@@ -144,12 +162,21 @@ def _gather_file_contents_recursive(node: FileSystemNode, buffer: StringIO) -> None:

    """
    if node.type != FileSystemNodeType.DIRECTORY:
+        # Write content and immediately clear cache to free memory
        buffer.write(node.content_string)
+        node.clear_content_cache()
        return

-    for child in node.children:
+    for files_processed, child in enumerate(node.children, 1):
        _gather_file_contents_recursive(child, buffer)

+        # Progressive cleanup every 10 files to prevent memory accumulation
+        if files_processed % 10 == 0:
+            force_garbage_collection()
+
+    # Clear content cache for this directory after processing all children
+    node.clear_content_cache()
+


def _create_tree_structure(
    query: IngestionQuery,
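`force_garbage_collection` and `log_memory_stats` come from the new `gitingest.utils.memory_utils` module, also not shown in this commit. A plausible minimal sketch, assuming plain `gc` plus the Unix-only `resource` module (the real module may use `psutil` or another mechanism instead):

```python
# Hypothetical sketch of gitingest/utils/memory_utils.py (not part of this diff).
import gc

from gitingest.utils.logging_config import get_logger

logger = get_logger(__name__)


def force_garbage_collection() -> None:
    """Run a full garbage collection across all generations."""
    gc.collect()


def log_memory_stats(context: str) -> None:
    """Log peak resident set size at a named checkpoint."""
    try:
        import resource  # unavailable on Windows

        # ru_maxrss is kilobytes on Linux, bytes on macOS
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.debug("Memory checkpoint", extra={"context": context, "ru_maxrss": peak})
    except ImportError:
        logger.debug("Memory checkpoint", extra={"context": context})
```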
@@ -201,35 +228,3 @@ def _create_tree_structure(
    for i, child in enumerate(node.children):
        tree_str += _create_tree_structure(query, node=child, prefix=prefix, is_last=i == len(node.children) - 1)
    return tree_str
-
-
-def _format_token_count(text: str) -> str | None:
-    """Return a human-readable token-count string (e.g. 1.2k, 1.2M).
-
-    Parameters
-    ----------
-    text : str
-        The text string for which the token count is to be estimated.
-
-    Returns
-    -------
-    str | None
-        The formatted number of tokens as a string (e.g., ``"1.2k"``, ``"1.2M"``), or ``None`` if an error occurs.
-
-    """
-    try:
-        encoding = tiktoken.get_encoding("o200k_base")  # gpt-4o, gpt-4o-mini
-        total_tokens = len(encoding.encode(text, disallowed_special=()))
-    except (ValueError, UnicodeEncodeError) as exc:
-        logger.warning("Failed to estimate token size", extra={"error": str(exc)})
-        return None
-    except (requests.exceptions.RequestException, ssl.SSLError) as exc:
-        # If network errors, skip token count estimation instead of erroring out
-        logger.warning("Failed to download tiktoken model", extra={"error": str(exc)})
-        return None
-
-    for threshold, suffix in _TOKEN_THRESHOLDS:
-        if total_tokens >= threshold:
-            return f"{total_tokens / threshold:.1f}{suffix}"
-
-    return str(total_tokens)
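With `_format_token_count` and `_TOKEN_THRESHOLDS` deleted, the k/M formatting presumably moves into `format_token_count` in `token_utils`, now taking the integer count produced by `count_tokens_optimized` instead of re-tokenizing the text. A sketch under that assumption:

```python
# Hypothetical sketch of format_token_count (not part of this diff). Assumes it
# keeps the thresholds that _format_token_count read from _TOKEN_THRESHOLDS,
# but takes an integer count rather than raw text.
def format_token_count(total_tokens: int) -> str:
    """Return a human-readable token count, e.g. '1.2k' or '1.2M'."""
    for threshold, suffix in ((1_000_000, "M"), (1_000, "k")):
        if total_tokens >= threshold:
            return f"{total_tokens / threshold:.1f}{suffix}"
    return str(total_tokens)
```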