Commit d17d332

feat: add memory optimizations to prevent OOM issues
1 parent: c057f6e

9 files changed: +180 additions, -8 deletions

.pre-commit-config.yaml

Lines changed: 2 additions & 0 deletions
@@ -122,6 +122,7 @@ repos:
 loguru>=0.7.0,
 pathspec>=0.12.1,
 prometheus-client,
+psutil>=5.9.0,
 pydantic,
 pytest-asyncio,
 pytest-mock,
@@ -150,6 +151,7 @@ repos:
 loguru>=0.7.0,
 pathspec>=0.12.1,
 prometheus-client,
+psutil>=5.9.0,
 pydantic,
 pytest-asyncio,
 pytest-mock,

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ server = [
     "boto3>=1.28.0",  # AWS SDK for S3 support
     "fastapi[standard]>=0.109.1",  # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2024-38)
     "prometheus-client",
+    "psutil>=5.9.0",  # Memory monitoring for optimization
     "sentry-sdk[fastapi]",
     "slowapi",
     "uvicorn>=0.11.7",  # Minimum safe release (https://osv.dev/vulnerability/PYSEC-2020-150)

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ httpx
 loguru>=0.7.0
 pathspec>=0.12.1
 prometheus-client
+psutil>=5.9.0  # Memory monitoring for optimization
 pydantic
 python-dotenv
 sentry-sdk[fastapi]

src/gitingest/config.py

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,10 @@
 MAX_TOTAL_SIZE_BYTES = 500 * 1024 * 1024  # Maximum size of output file (500 MB)
 DEFAULT_TIMEOUT = 60  # seconds
 
+# Memory optimization settings
+BATCH_SIZE = 100  # Process files in batches to reduce memory usage
+MEMORY_CHECK_INTERVAL = 50  # Check memory usage every N files
+
 OUTPUT_FILE_NAME = "digest.txt"
 
 TMP_BASE_PATH = Path(tempfile.gettempdir()) / "gitingest"
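
Of the two new settings, only MEMORY_CHECK_INTERVAL is consumed elsewhere in this commit (in src/gitingest/ingestion.py below); BATCH_SIZE has no consumer in this diff. A minimal sketch of how a batching consumer might look, purely illustrative (ingest_in_batches, process_one, and flush are hypothetical, not part of this commit):

```python
BATCH_SIZE = 100  # mirrors src/gitingest/config.py in this commit


def ingest_in_batches(paths: list[str]) -> None:
    """Hypothetical sketch: hold at most one batch of results in memory."""
    for start in range(0, len(paths), BATCH_SIZE):
        batch = paths[start : start + BATCH_SIZE]
        results = [process_one(p) for p in batch]  # hypothetical per-file work
        flush(results)  # write out, then let the batch be reclaimed


def process_one(path: str) -> str:
    return path  # stand-in


def flush(results: list[str]) -> None:
    pass  # stand-in
```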

src/gitingest/ingestion.py

Lines changed: 17 additions & 1 deletion
@@ -5,11 +5,12 @@
 from pathlib import Path
 from typing import TYPE_CHECKING
 
-from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES
+from gitingest.config import MAX_DIRECTORY_DEPTH, MAX_FILES, MAX_TOTAL_SIZE_BYTES, MEMORY_CHECK_INTERVAL
 from gitingest.output_formatter import format_node
 from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats
 from gitingest.utils.ingestion_utils import _should_exclude, _should_include
 from gitingest.utils.logging_config import get_logger
+from gitingest.utils.memory_utils import check_memory_pressure, force_garbage_collection, log_memory_stats
 
 if TYPE_CHECKING:
     from gitingest.schemas import IngestionQuery
@@ -51,6 +52,9 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
         },
     )
 
+    # Log initial memory usage
+    log_memory_stats("at ingestion start")
+
     subpath = Path(query.subpath.strip("/")).as_posix()
     path = query.local_path / subpath
 
@@ -117,6 +121,9 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
         },
     )
 
+    # Log final memory usage
+    log_memory_stats("at ingestion completion")
+
     return format_node(root_node, query=query)
 
 
@@ -258,6 +265,15 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStat
     stats.total_files += 1
     stats.total_size += file_size
 
+    # Check memory usage periodically and force GC if needed
+    if stats.total_files % MEMORY_CHECK_INTERVAL == 0 and check_memory_pressure():
+        logger.warning(
+            "Memory pressure detected, forcing garbage collection",
+            extra={"files_processed": stats.total_files},
+        )
+        force_garbage_collection()
+        log_memory_stats(f"after processing {stats.total_files} files")
+
     child = FileSystemNode(
         name=path.name,
         type=FileSystemNodeType.FILE,
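
The new guard in _process_file short-circuits: because of the `and`, the psutil probe inside check_memory_pressure() runs at most once every MEMORY_CHECK_INTERVAL files, and gc.collect() fires only when resident memory is over the threshold. A self-contained sketch of the same gating pattern (the loop and file count are illustrative; the psutil call mirrors memory_utils.py below):

```python
import gc

import psutil

MEMORY_CHECK_INTERVAL = 50  # mirrors src/gitingest/config.py
THRESHOLD_MB = 3000         # mirrors check_memory_pressure()'s default

files_processed = 0
for _ in range(200):  # stand-in for walking a repository tree
    files_processed += 1
    # Equivalent to the diff's `count % N == 0 and check_memory_pressure()`:
    # the memory probe runs at most once per interval
    if files_processed % MEMORY_CHECK_INTERVAL == 0:
        rss_mb = psutil.Process().memory_info().rss / (1024 * 1024)
        if rss_mb > THRESHOLD_MB:
            collected = gc.collect()
            print(f"collected {collected} objects after {files_processed} files")
```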

src/gitingest/output_formatter.py

Lines changed: 27 additions & 2 deletions
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import ssl
+from io import StringIO
 from typing import TYPE_CHECKING
 
 import requests.exceptions
@@ -122,8 +123,32 @@ def _gather_file_contents(node: FileSystemNode) -> str:
     if node.type != FileSystemNodeType.DIRECTORY:
         return node.content_string
 
-    # Recursively gather contents of all files under the current directory
-    return "\n".join(_gather_file_contents(child) for child in node.children)
+    # Use StringIO for memory-efficient string concatenation
+    content_buffer = StringIO()
+    try:
+        _gather_file_contents_recursive(node, content_buffer)
+        return content_buffer.getvalue()
+    finally:
+        content_buffer.close()
+
+
+def _gather_file_contents_recursive(node: FileSystemNode, buffer: StringIO) -> None:
+    """Recursively gather file contents into a StringIO buffer to reduce memory usage.
+
+    Parameters
+    ----------
+    node : FileSystemNode
+        The current directory or file node being processed.
+    buffer : StringIO
+        Buffer to write content to.
+
+    """
+    if node.type != FileSystemNodeType.DIRECTORY:
+        buffer.write(node.content_string)
+        return
+
+    for child in node.children:
+        _gather_file_contents_recursive(child, buffer)
 
 
 def _create_tree_structure(
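
Design note on this hunk: the join version returns a fully materialized string from every recursive call, so each directory level briefly holds its subtree's text both in the child results and in the joined copy, while the buffer version writes each file into one shared StringIO exactly once. One behavioral difference worth flagging: join put a "\n" between children, and the recursive writer adds no separator. A toy illustration of both shapes (Node is a stand-in for FileSystemNode; an empty-children node plays the role of a file here):

```python
from dataclasses import dataclass, field
from io import StringIO


@dataclass
class Node:  # toy stand-in for FileSystemNode
    text: str = ""
    children: list["Node"] = field(default_factory=list)


def gather_join(node: Node) -> str:
    if not node.children:
        return node.text
    # builds a full intermediate string at every directory level
    return "\n".join(gather_join(child) for child in node.children)


def gather_buffer(node: Node, buf: StringIO) -> None:
    if not node.children:
        buf.write(node.text)  # each leaf is written exactly once
        return
    for child in node.children:
        gather_buffer(child, buf)


tree = Node(children=[Node(text="a"), Node(children=[Node(text="b")])])
buf = StringIO()
gather_buffer(tree, buf)
assert gather_join(tree) == "a\nb"
assert buf.getvalue() == "ab"  # no separator in the buffer variant
```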

src/gitingest/schemas/filesystem.py

Lines changed: 30 additions & 2 deletions
@@ -5,6 +5,7 @@
 import os
 from dataclasses import dataclass, field
 from enum import Enum, auto
+from io import StringIO
 from typing import TYPE_CHECKING
 
 from gitingest.utils.compat_func import readlink
@@ -155,7 +156,34 @@ def content(self) -> str:  # pylint: disable=too-many-return-statements
             return "Error: Unable to decode file with available encodings"
 
         try:
-            with self.path.open(encoding=good_enc) as fp:
-                return fp.read()
+            return self._read_file_content_streaming(good_enc)
         except (OSError, UnicodeDecodeError) as exc:
            return f"Error reading file with {good_enc!r}: {exc}"
+
+    def _read_file_content_streaming(self, encoding: str, chunk_size: int = 8192) -> str:
+        """Read file content using streaming to reduce memory usage.
+
+        Parameters
+        ----------
+        encoding : str
+            The encoding to use for reading the file.
+        chunk_size : int
+            Size of chunks to read at a time (default: 8192 bytes).
+
+        Returns
+        -------
+        str
+            The file content.
+
+        """
+        content_buffer = StringIO()
+        try:
+            with self.path.open(encoding=encoding) as fp:
+                while True:
+                    chunk = fp.read(chunk_size)
+                    if not chunk:
+                        break
+                    content_buffer.write(chunk)
+            return content_buffer.getvalue()
+        finally:
+            content_buffer.close()
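
A note on what the streaming read does and does not save: reading in 8 KiB chunks caps how much raw data is buffered per read, but getvalue() still returns the whole file as a single string, so a large file ends up fully in memory either way. A standalone version of the same pattern for experimentation (the example file is hypothetical):

```python
from io import StringIO
from pathlib import Path


def read_streaming(path: Path, encoding: str = "utf-8", chunk_size: int = 8192) -> str:
    """Same shape as FileSystemNode._read_file_content_streaming in this commit."""
    buf = StringIO()
    try:
        with path.open(encoding=encoding) as fp:
            while chunk := fp.read(chunk_size):
                buf.write(chunk)
        return buf.getvalue()
    finally:
        buf.close()


# hypothetical round-trip check
p = Path("example.txt")
p.write_text("x" * 20_000, encoding="utf-8")
assert read_streaming(p) == p.read_text(encoding="utf-8")
```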

src/gitingest/utils/memory_utils.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
+"""Memory utility functions for monitoring and optimization."""
+
+from __future__ import annotations
+
+import gc
+from typing import Any
+
+import psutil
+
+from gitingest.utils.logging_config import get_logger
+
+logger = get_logger(__name__)
+
+
+def get_memory_usage() -> dict[str, Any]:
+    """Get current memory usage statistics.
+
+    Returns
+    -------
+    dict[str, Any]
+        Dictionary containing memory usage statistics in MB.
+
+    """
+    try:
+        process = psutil.Process()
+        memory_info = process.memory_info()
+
+        return {
+            "rss_mb": memory_info.rss / (1024 * 1024),  # Resident Set Size
+            "vms_mb": memory_info.vms / (1024 * 1024),  # Virtual Memory Size
+            "percent": process.memory_percent(),
+        }
+    except Exception as exc:
+        logger.warning("Failed to get memory usage", extra={"error": str(exc)})
+        return {"rss_mb": 0, "vms_mb": 0, "percent": 0}
+
+
+def force_garbage_collection() -> None:
+    """Force garbage collection to free up memory."""
+    try:
+        collected = gc.collect()
+        logger.debug("Forced garbage collection", extra={"objects_collected": collected})
+    except Exception as exc:
+        logger.warning("Failed to force garbage collection", extra={"error": str(exc)})
+
+
+def check_memory_pressure(threshold_mb: float = 3000) -> bool:
+    """Check if memory usage is above threshold.
+
+    Parameters
+    ----------
+    threshold_mb : float
+        Memory threshold in MB (default: 3000 MB = 3 GB).
+
+    Returns
+    -------
+    bool
+        True if memory usage is above threshold.
+
+    """
+    memory_stats = get_memory_usage()
+    return memory_stats["rss_mb"] > threshold_mb
+
+
+def log_memory_stats(context: str = "") -> None:
+    """Log current memory statistics.
+
+    Parameters
+    ----------
+    context : str
+        Context information for the log message.
+
+    """
+    memory_stats = get_memory_usage()
+    logger.info(
+        "Memory usage %s",
+        context,
+        extra={
+            "memory_rss_mb": round(memory_stats["rss_mb"], 2),
+            "memory_vms_mb": round(memory_stats["vms_mb"], 2),
+            "memory_percent": round(memory_stats["percent"], 2),
+        },
+    )
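
For reference, the whole surface of the new module in one usage example; every function and signature here comes straight from the file above:

```python
from gitingest.utils.memory_utils import (
    check_memory_pressure,
    force_garbage_collection,
    get_memory_usage,
    log_memory_stats,
)

stats = get_memory_usage()  # {"rss_mb": ..., "vms_mb": ..., "percent": ...}
print(f"resident: {stats['rss_mb']:.1f} MB ({stats['percent']:.1f}% of system)")

log_memory_stats("before heavy work")  # structured log via gitingest's logger
if check_memory_pressure(threshold_mb=1024):  # tighter than the 3000 MB default
    force_garbage_collection()
```

Note the defensive design: get_memory_usage() logs and returns zeros rather than raising when psutil fails, so the callers added in ingestion.py cannot crash on platforms where process inspection is restricted.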

src/server/query_processor.py

Lines changed: 15 additions & 3 deletions
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import shutil
+from io import StringIO
 from pathlib import Path
 from typing import TYPE_CHECKING, cast
 
@@ -302,7 +303,19 @@ async def process_query(
 
     try:
         summary, tree, content = ingest_query(query)
-        digest_content = tree + "\n" + content
+
+        # Clean up repository immediately after ingestion to free memory
+        _cleanup_repository(clone_config)
+
+        # Use StringIO for memory-efficient string concatenation
+        digest_buffer = StringIO()
+        try:
+            digest_buffer.write(tree)
+            digest_buffer.write("\n")
+            digest_buffer.write(content)
+            digest_content = digest_buffer.getvalue()
+        finally:
+            digest_buffer.close()
         _store_digest_content(query, clone_config, digest_content, summary, tree, content)
     except Exception as exc:
         _print_error(query.url, exc, max_file_size, pattern_type, pattern)
@@ -326,8 +339,7 @@ async def process_query(
 
     digest_url = _generate_digest_url(query)
 
-    # Clean up the repository after successful processing
-    _cleanup_repository(clone_config)
+    # Repository was already cleaned up after ingestion to free memory earlier
 
     return IngestSuccessResponse(
         repo_url=input_text,
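
Two things change in this hunk. The larger one is ordering: _cleanup_repository now runs as soon as ingest_query returns, so the on-disk checkout is deleted before the digest string is assembled rather than after the response is built. The StringIO change is arguably more cosmetic; for exactly three fragments it yields the same result as + concatenation with a broadly similar peak allocation, as this quick check shows (the sample strings are illustrative):

```python
from io import StringIO

tree, content = "dir/\n  file.txt", "FILE: file.txt\n..."

digest_buffer = StringIO()
try:
    digest_buffer.write(tree)
    digest_buffer.write("\n")
    digest_buffer.write(content)
    via_buffer = digest_buffer.getvalue()
finally:
    digest_buffer.close()

assert via_buffer == tree + "\n" + content  # same output either way
```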
