from vllm.logger import init_logger
-from aioprometheus import Counter, Gauge, Histogram
+from prometheus_client import Counter, Gauge, Histogram, REGISTRY, disable_created_metrics

import time
import numpy as np
-from typing import List
+from typing import Dict, List
from dataclasses import dataclass

logger = init_logger(__name__)

-labels = {}
-
-
-def add_global_metrics_labels(**kwargs):
-    labels.update(kwargs)
-
+disable_created_metrics()

# The begin-* and end* here are used by the documentation generator
# to extract the metrics definitions.

+
# begin-metrics-definitions
-gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s",
-                                    "Average prefill throughput in tokens/s.")
-gauge_avg_generation_throughput = Gauge(
-    "vllm:avg_generation_throughput_toks_per_s",
-    "Average generation throughput in tokens/s.")
-counter_prompt_tokens = Counter("vllm:prompt_tokens_total",
-                                "Number of prefill tokens processed.")
-counter_generation_tokens = Counter("vllm:generation_tokens_total",
-                                    "Number of generation tokens processed.")
-
-gauge_scheduler_running = Gauge(
-    "vllm:num_requests_running",
-    "Number of requests currently running on GPU.")
-gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped",
-                                "Number of requests swapped to CPU.")
-gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting",
-                                "Number of requests waiting to be processed.")
-
-gauge_gpu_cache_usage = Gauge(
-    "vllm:gpu_cache_usage_perc",
-    "GPU KV-cache usage. 1 means 100 percent usage.")
-gauge_cpu_cache_usage = Gauge(
-    "vllm:cpu_cache_usage_perc",
-    "CPU KV-cache usage. 1 means 100 percent usage.")
-
-histogram_time_to_first_token = Histogram(
-    "vllm:time_to_first_token_seconds",
-    "Histogram of time to first token in seconds.",
-    buckets=[
-        0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, 0.75, 1.0,
-        2.5, 5.0, 7.5, 10.0
-    ])
-histogram_time_per_output_tokens = Histogram(
-    "vllm:time_per_output_token_seconds",
-    "Histogram of time per output token in seconds.",
-    buckets=[
-        0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0, 2.5
-    ])
-histogram_e2e_request_latency = Histogram(
-    "vllm:e2e_request_latency_seconds",
-    "Histogram of end to end request latency in seconds.",
-    buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
+class Metrics:
+
+    def __init__(self, labelnames: List[str]):
+        # Unregister any existing vLLM collectors
+        for collector in list(REGISTRY._collector_to_names):
+            if hasattr(collector, "_name") and "vllm" in collector._name:
+                REGISTRY.unregister(collector)
+
+        # System stats
+        self.gauge_scheduler_running = Gauge(
+            name="vllm:num_requests_running",
+            documentation="Number of requests currently running on GPU.",
+            labelnames=labelnames)
+        self.gauge_scheduler_swapped = Gauge(
+            name="vllm:num_requests_swapped",
+            documentation="Number of requests swapped to CPU.",
+            labelnames=labelnames)
+        self.gauge_scheduler_waiting = Gauge(
+            name="vllm:num_requests_waiting",
+            documentation="Number of requests waiting to be processed.",
+            labelnames=labelnames)
+        self.gauge_gpu_cache_usage = Gauge(
+            name="vllm:gpu_cache_usage_perc",
+            documentation="GPU KV-cache usage. 1 means 100 percent usage.",
+            labelnames=labelnames)
+        self.gauge_cpu_cache_usage = Gauge(
+            name="vllm:cpu_cache_usage_perc",
+            documentation="CPU KV-cache usage. 1 means 100 percent usage.",
+            labelnames=labelnames)
+
+        # Raw stats from last model iteration
+        self.counter_prompt_tokens = Counter(
+            name="vllm:prompt_tokens_total",
+            documentation="Number of prefill tokens processed.",
+            labelnames=labelnames)
+        self.counter_generation_tokens = Counter(
+            name="vllm:generation_tokens_total",
+            documentation="Number of generation tokens processed.",
+            labelnames=labelnames)
+        self.histogram_time_to_first_token = Histogram(
+            name="vllm:time_to_first_token_seconds",
+            documentation="Histogram of time to first token in seconds.",
+            labelnames=labelnames,
+            buckets=[
+                0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5,
+                0.75, 1.0, 2.5, 5.0, 7.5, 10.0
+            ])
+        self.histogram_time_per_output_token = Histogram(
+            name="vllm:time_per_output_token_seconds",
+            documentation="Histogram of time per output token in seconds.",
+            labelnames=labelnames,
+            buckets=[
+                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+                1.0, 2.5
+            ])
+        self.histogram_e2e_request_latency = Histogram(
+            name="vllm:e2e_request_latency_seconds",
+            documentation="Histogram of end to end request latency in seconds.",
+            labelnames=labelnames,
+            buckets=[1.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0])
+
+        # Legacy metrics
+        self.gauge_avg_prompt_throughput = Gauge(
+            name="vllm:avg_prompt_throughput_toks_per_s",
+            documentation="Average prefill throughput in tokens/s.",
+            labelnames=labelnames,
+        )
+        self.gauge_avg_generation_throughput = Gauge(
+            name="vllm:avg_generation_throughput_toks_per_s",
+            documentation="Average generation throughput in tokens/s.",
+            labelnames=labelnames,
+        )
+
+
# end-metrics-definitions

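For readers unfamiliar with `prometheus_client` (commentary, not part of the diff): a metric declared with `labelnames` is a metric family, and the concrete time series for one label combination is obtained with `.labels(...)` before calling `.set()`, `.inc()` or `.observe()`. A minimal sketch of that pattern, with made-up metric names and a made-up `model_name` label value:

```python
# Illustration only (not from the diff) of the prometheus_client label API the
# Metrics class above relies on. Metric names and the "model_name" value are made up.
from prometheus_client import Counter, Gauge, generate_latest

gauge = Gauge(name="demo:num_requests_running",
              documentation="Demo gauge.",
              labelnames=["model_name"])
counter = Counter(name="demo:prompt_tokens_total",
                  documentation="Demo counter.",
                  labelnames=["model_name"])

# .labels(...) returns the child series for one label combination; the value
# methods (.set/.inc/.observe) are called on the child, not on the family.
gauge.labels(model_name="llama-7b").set(3)
counter.labels(model_name="llama-7b").inc(16)

print(generate_latest().decode())  # text exposition, as served on /metrics
```

This is why the `Metrics` constructor takes `labelnames`, and why `StatLogger` below calls `.labels(**self.labels)` before every `.set()`, `.inc()` and `.observe()`.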
@@ -87,7 +115,7 @@ class Stats:
class StatLogger:
    """StatLogger is used by LLMEngine to log to Prometheus and Stdout."""

-    def __init__(self, local_interval: float) -> None:
+    def __init__(self, local_interval: float, labels: Dict[str, str]) -> None:
        # Metadata for logging locally.
        self.last_local_log = time.monotonic()
        self.local_interval = local_interval
@@ -96,6 +124,10 @@ def __init__(self, local_interval: float) -> None:
        self.num_prompt_tokens: List[int] = []
        self.num_generation_tokens: List[int] = []

+        # Prometheus metrics
+        self.labels = labels
+        self.metrics = Metrics(labelnames=list(labels.keys()))
+
    def _get_throughput(self, tracked_stats: List[int], now: float) -> float:
        return float(np.sum(tracked_stats) / (now - self.last_local_log))

@@ -105,23 +137,33 @@ def _local_interval_elapsed(self, now: float) -> bool:

    def _log_prometheus(self, stats: Stats) -> None:
        # Set system stat gauges.
-        gauge_scheduler_running.set(labels, stats.num_running)
-        gauge_scheduler_swapped.set(labels, stats.num_swapped)
-        gauge_scheduler_waiting.set(labels, stats.num_waiting)
-        gauge_gpu_cache_usage.set(labels, stats.gpu_cache_usage)
-        gauge_cpu_cache_usage.set(labels, stats.cpu_cache_usage)
+        self.metrics.gauge_scheduler_running.labels(**self.labels).set(
+            stats.num_running)
+        self.metrics.gauge_scheduler_swapped.labels(**self.labels).set(
+            stats.num_swapped)
+        self.metrics.gauge_scheduler_waiting.labels(**self.labels).set(
+            stats.num_waiting)
+        self.metrics.gauge_gpu_cache_usage.labels(**self.labels).set(
+            stats.gpu_cache_usage)
+        self.metrics.gauge_cpu_cache_usage.labels(**self.labels).set(
+            stats.cpu_cache_usage)

        # Add to token counters.
-        counter_prompt_tokens.add(labels, stats.num_prompt_tokens)
-        counter_generation_tokens.add(labels, stats.num_generation_tokens)
+        self.metrics.counter_prompt_tokens.labels(**self.labels).inc(
+            stats.num_prompt_tokens)
+        self.metrics.counter_generation_tokens.labels(**self.labels).inc(
+            stats.num_generation_tokens)

        # Observe request level latencies in histograms.
        for ttft in stats.time_to_first_tokens:
-            histogram_time_to_first_token.observe(labels, ttft)
+            self.metrics.histogram_time_to_first_token.labels(
+                **self.labels).observe(ttft)
        for tpot in stats.time_per_output_tokens:
-            histogram_time_per_output_tokens.observe(labels, tpot)
+            self.metrics.histogram_time_per_output_token.labels(
+                **self.labels).observe(tpot)
        for e2e in stats.time_e2e_requests:
-            histogram_e2e_request_latency.observe(labels, e2e)
+            self.metrics.histogram_e2e_request_latency.labels(
+                **self.labels).observe(e2e)

    def _log_prometheus_interval(self, prompt_throughput: float,
                                 generation_throughput: float) -> None:
@@ -130,8 +172,10 @@ def _log_prometheus_interval(self, prompt_throughput: float,
        # Moving forward, we should use counters like counter_prompt_tokens, counter_generation_tokens
        # Which log raw data and calculate summaries using rate() on the grafana/prometheus side.
        # See https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666
-        gauge_avg_prompt_throughput.set(labels, prompt_throughput)
-        gauge_avg_generation_throughput.set(labels, generation_throughput)
+        self.metrics.gauge_avg_prompt_throughput.labels(
+            **self.labels).set(prompt_throughput)
+        self.metrics.gauge_avg_generation_throughput.labels(
+            **self.labels).set(generation_throughput)

    def log(self, stats: Stats) -> None:
        """Called by LLMEngine.