
Commit 8e4e37c

feat: add prometheus memory metrics for ingestion monitoring
1 parent d17d332

File tree

4 files changed: +183 -27 lines

  src/gitingest/utils/memory_utils.py
  src/server/memory_metrics.py
  src/server/metrics_server.py
  src/server/query_processor.py

src/gitingest/utils/memory_utils.py

Lines changed: 3 additions & 3 deletions
@@ -72,10 +72,10 @@ def log_memory_stats(context: str = "") -> None:
 
     """
     memory_stats = get_memory_usage()
-    logger.info(
-        "Memory usage %s",
-        context,
+    logger.debug(
+        "Memory usage statistics",
         extra={
+            "context": context,
             "memory_rss_mb": round(memory_stats["rss_mb"], 2),
             "memory_vms_mb": round(memory_stats["vms_mb"], 2),
             "memory_percent": round(memory_stats["percent"], 2),

src/server/memory_metrics.py

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
+"""Memory usage metrics for Prometheus monitoring."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from prometheus_client import Gauge, Histogram
+
+from gitingest.utils.memory_utils import get_memory_usage
+
+if TYPE_CHECKING:
+    import types
+    from typing import Self
+
+# Memory usage gauges
+memory_usage_rss_mb = Gauge(
+    "gitingest_memory_usage_rss_mb",
+    "Resident Set Size memory usage in MB",
+    ["repo_url"],
+)
+
+memory_usage_vms_mb = Gauge(
+    "gitingest_memory_usage_vms_mb",
+    "Virtual Memory Size usage in MB",
+    ["repo_url"],
+)
+
+memory_usage_percent = Gauge(
+    "gitingest_memory_usage_percent",
+    "Memory usage percentage",
+    ["repo_url"],
+)
+
+# Memory usage histogram to track distribution of memory consumption per repository
+memory_consumption_histogram = Histogram(
+    "gitingest_memory_consumption_mb",
+    "Memory consumption distribution per repository in MB",
+    ["repo_url"],
+    buckets=(50, 100, 250, 500, 1000, 2000, 3000, 5000, 10000, float("inf")),
+)
+
+# Peak memory usage gauge
+peak_memory_usage_mb = Gauge(
+    "gitingest_peak_memory_usage_mb",
+    "Peak memory usage during ingestion in MB",
+    ["repo_url"],
+)
+
+
+def record_memory_usage(repo_url: str) -> dict[str, float]:
+    """Record current memory usage metrics for a repository.
+
+    Parameters
+    ----------
+    repo_url : str
+        The repository URL to label the metrics with
+
+    Returns
+    -------
+    dict[str, float]
+        Current memory usage statistics
+
+    """
+    # Truncate URL for label to avoid excessive cardinality
+    repo_label = repo_url[:255]
+
+    # Get current memory stats
+    memory_stats = get_memory_usage()
+
+    # Record current memory usage
+    memory_usage_rss_mb.labels(repo_url=repo_label).set(memory_stats["rss_mb"])
+    memory_usage_vms_mb.labels(repo_url=repo_label).set(memory_stats["vms_mb"])
+    memory_usage_percent.labels(repo_url=repo_label).set(memory_stats["percent"])
+
+    # Record in histogram for distribution analysis
+    memory_consumption_histogram.labels(repo_url=repo_label).observe(memory_stats["rss_mb"])
+
+    return memory_stats
+
+
+def record_peak_memory_usage(repo_url: str, peak_mb: float) -> None:
+    """Record peak memory usage for a repository ingestion.
+
+    Parameters
+    ----------
+    repo_url : str
+        The repository URL to label the metrics with
+    peak_mb : float
+        Peak memory usage in MB
+
+    """
+    repo_label = repo_url[:255]
+    peak_memory_usage_mb.labels(repo_url=repo_label).set(peak_mb)
+
+
+class MemoryTracker:
+    """Context manager to track memory usage during repository ingestion.
+
+    Parameters
+    ----------
+    repo_url : str
+        Repository URL for labeling metrics
+
+    """
+
+    def __init__(self, repo_url: str) -> None:
+        self.repo_url = repo_url
+        self.initial_memory = 0.0
+        self.peak_memory = 0.0
+
+    def __enter__(self) -> Self:
+        """Start memory tracking."""
+        initial_stats = get_memory_usage()
+        self.initial_memory = initial_stats["rss_mb"]
+        self.peak_memory = self.initial_memory
+
+        # Record initial memory usage
+        record_memory_usage(self.repo_url)
+
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: types.TracebackType | None,
+    ) -> None:
+        """End memory tracking and record peak usage."""
+        # Record final memory usage
+        final_stats = record_memory_usage(self.repo_url)
+
+        # Update peak if current is higher
+        self.peak_memory = max(self.peak_memory, final_stats["rss_mb"])
+
+        # Record peak memory usage
+        record_peak_memory_usage(self.repo_url, self.peak_memory)
+
+    def update_peak(self) -> None:
+        """Update peak memory if current usage is higher."""
+        current_stats = get_memory_usage()
+        self.peak_memory = max(self.peak_memory, current_stats["rss_mb"])
+
+        # Also record current usage
+        record_memory_usage(self.repo_url)
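
For reviewers, a minimal usage sketch; the driver code below is hypothetical and not part of this commit. Entering and exiting the tracker publishes gauge and histogram samples to prometheus_client's default registry as a side effect, and update_peak() lets callers sample at likely high-water marks:

from prometheus_client import generate_latest

from server.memory_metrics import MemoryTracker


def fake_workload() -> list[int]:
    # Stand-in for clone_repo() / ingest_query(); allocates a few MB.
    return list(range(1_000_000))


with MemoryTracker("https://github.com/example/repo") as tracker:
    data = fake_workload()     # keep a reference so the allocation stays live
    tracker.update_peak()      # sample at a likely high-water mark

# The samples now appear in the exposition output, e.g.
# gitingest_peak_memory_usage_mb{repo_url="https://github.com/example/repo"} 123.4
print(generate_latest().decode())

One design note: all four metrics are labeled by repo_url, which gives per-repository visibility; the [:255] truncation bounds label length, though the number of distinct repo_url values still drives series cardinality.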

src/server/metrics_server.py

Lines changed: 3 additions & 0 deletions
@@ -7,6 +7,9 @@
 
 from gitingest.utils.logging_config import get_logger
 
+# Import to ensure memory metrics are registered
+from server import memory_metrics  # noqa: F401  # pylint: disable=unused-import
+
 # Create a logger for this module
 logger = get_logger(__name__)
 
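
Why a bare import is enough: prometheus_client collectors register themselves with the global default REGISTRY when constructed at module import time, so importing server.memory_metrics suffices for the metrics endpoint to expose them. A small illustrative check, assuming the default registry is used:

from prometheus_client import REGISTRY

import server.memory_metrics  # noqa: F401  # side effect: collector registration

# Collector names visible to the /metrics endpoint via the default registry
names = {metric.name for metric in REGISTRY.collect()}
assert "gitingest_memory_usage_rss_mb" in names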

src/server/query_processor.py

Lines changed: 33 additions & 24 deletions
@@ -13,6 +13,7 @@
 from gitingest.utils.git_utils import resolve_commit, validate_github_token
 from gitingest.utils.logging_config import get_logger
 from gitingest.utils.pattern_utils import process_patterns
+from server.memory_metrics import MemoryTracker
 from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType, S3Metadata
 from server.s3_utils import (
     _build_s3_url,
@@ -292,36 +293,44 @@ async def process_query(
         return s3_response
 
     clone_config = query.extract_clone_config()
-    await clone_repo(clone_config, token=token)
-
     short_repo_url = f"{query.user_name}/{query.repo_name}"
 
-    # The commit hash should always be available at this point
-    if not query.commit:
-        msg = "Unexpected error: no commit hash found"
-        raise RuntimeError(msg)
+    # Track memory usage during the entire ingestion process
+    with MemoryTracker(input_text) as memory_tracker:
+        await clone_repo(clone_config, token=token)
 
-    try:
-        summary, tree, content = ingest_query(query)
+        # Update peak memory after cloning
+        memory_tracker.update_peak()
 
-        # Clean up repository immediately after ingestion to free memory
-        _cleanup_repository(clone_config)
+        # The commit hash should always be available at this point
+        if not query.commit:
+            msg = "Unexpected error: no commit hash found"
+            raise RuntimeError(msg)
 
-        # Use StringIO for memory-efficient string concatenation
-        digest_buffer = StringIO()
         try:
-            digest_buffer.write(tree)
-            digest_buffer.write("\n")
-            digest_buffer.write(content)
-            digest_content = digest_buffer.getvalue()
-        finally:
-            digest_buffer.close()
-        _store_digest_content(query, clone_config, digest_content, summary, tree, content)
-    except Exception as exc:
-        _print_error(query.url, exc, max_file_size, pattern_type, pattern)
-        # Clean up repository even if processing failed
-        _cleanup_repository(clone_config)
-        return IngestErrorResponse(error=f"{exc!s}")
+            summary, tree, content = ingest_query(query)
+
+            # Update peak memory after ingestion (this is likely the highest usage)
+            memory_tracker.update_peak()
+
+            # Clean up repository immediately after ingestion to free memory
+            _cleanup_repository(clone_config)
+
+            # Use StringIO for memory-efficient string concatenation
+            digest_buffer = StringIO()
+            try:
+                digest_buffer.write(tree)
+                digest_buffer.write("\n")
+                digest_buffer.write(content)
+                digest_content = digest_buffer.getvalue()
+            finally:
+                digest_buffer.close()
+            _store_digest_content(query, clone_config, digest_content, summary, tree, content)
+        except Exception as exc:
+            _print_error(query.url, exc, max_file_size, pattern_type, pattern)
+            # Clean up repository even if processing failed
+            _cleanup_repository(clone_config)
+            return IngestErrorResponse(error=f"{exc!s}")
 
     if len(content) > MAX_DISPLAY_SIZE:
        content = (
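
For completeness, a hedged sanity check of the tracker now wrapping process_query; this is a hypothetical test, not included in the commit. The recorded peak can never be below the initial RSS sample taken in __enter__, and the peak gauge should be set for the repo label afterwards:

from prometheus_client import REGISTRY

from server.memory_metrics import MemoryTracker


def test_memory_tracker_records_peak() -> None:
    repo = "https://github.com/example/repo"
    with MemoryTracker(repo) as tracker:
        blob = bytearray(16 * 1024 * 1024)  # allocate ~16 MB inside the block
        tracker.update_peak()
        del blob
    # Peak is initialized from the initial reading, so it can only grow
    assert tracker.peak_memory >= tracker.initial_memory
    # The peak gauge for this repo label should now carry a sample
    value = REGISTRY.get_sample_value(
        "gitingest_peak_memory_usage_mb",
        {"repo_url": repo},
    )
    assert value is not None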
