@@ -39,40 +39,62 @@ type Monitor struct {
 	requestGauge *prometheus.GaugeVec
 
 	cpuStats *utils.CPUStats
+	requests atomic.Int32
 
-	pendingCPUs atomic.Float64
+	mu        sync.Mutex
+	pending   map[string]*processStats
+	procStats map[int]*processStats
+}
+
+type processStats struct {
+	egressID string
+
+	pendingUsage float64
+	lastUsage    float64
 
-	mu              sync.Mutex
-	requests        atomic.Int32
-	prevEgressUsage map[string]float64
+	totalCPU   float64
+	cpuCounter int
+	maxCPU     float64
 }
 
-const cpuHoldDuration = time.Second * 5
+const cpuHoldDuration = time.Second * 30
 
 func NewMonitor(conf *config.ServiceConfig) *Monitor {
 	return &Monitor{
 		cpuCostConfig: conf.CPUCostConfig,
+		pending:       make(map[string]*processStats),
+		procStats:     make(map[int]*processStats),
 	}
 }
 
 func (m *Monitor) Start(
 	conf *config.ServiceConfig,
 	isIdle func() float64,
 	canAcceptRequest func() float64,
-	procUpdate func(map[int]float64) map[string]float64,
 ) error {
 	procStats, err := utils.NewProcCPUStats(func(idle float64, usage map[int]float64) {
 		m.promCPULoad.Set(1 - idle/m.cpuStats.NumCPU())
-		egressUsage := procUpdate(usage)
-		for egressID, cpuUsage := range egressUsage {
-			m.promProcCPULoad.With(prometheus.Labels{"egress_id": egressID}).Set(cpuUsage)
-		}
-		for egressID := range m.prevEgressUsage {
-			if _, ok := egressUsage[egressID]; !ok {
-				m.promProcCPULoad.With(prometheus.Labels{"egress_id": egressID}).Set(0)
+
+		m.mu.Lock()
+		defer m.mu.Unlock()
+
+		for pid, cpuUsage := range usage {
+			if m.procStats[pid] == nil {
+				m.procStats[pid] = &processStats{}
+			}
+			procStats := m.procStats[pid]
+
+			procStats.lastUsage = cpuUsage
+			procStats.totalCPU += cpuUsage
+			procStats.cpuCounter++
+			if cpuUsage > procStats.maxCPU {
+				procStats.maxCPU = cpuUsage
+			}
+
+			if procStats.egressID != "" {
+				m.promProcCPULoad.With(prometheus.Labels{"egress_id": procStats.egressID}).Set(cpuUsage)
 			}
 		}
-		m.prevEgressUsage = egressUsage
 	})
 	if err != nil {
 		return err
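The per-PID bookkeeping added to the Start callback above reduces to a small accumulator: keep the most recent sample, a running sum and count for the average, and the peak. A minimal standalone sketch of that accounting (type name and sample values are illustrative, not taken from this PR):

// sketch: per-sample CPU accounting, mirroring the processStats updates above
package main

import "fmt"

type sampleStats struct {
	lastUsage  float64 // most recent sample
	totalCPU   float64 // running sum of all samples
	cpuCounter int     // number of samples seen
	maxCPU     float64 // peak sample
}

func (s *sampleStats) record(cpuUsage float64) {
	s.lastUsage = cpuUsage
	s.totalCPU += cpuUsage
	s.cpuCounter++
	if cpuUsage > s.maxCPU {
		s.maxCPU = cpuUsage
	}
}

func main() {
	s := &sampleStats{}
	for _, cpuUsage := range []float64{0.5, 2.0, 1.5} { // hypothetical samples, in CPUs
		s.record(cpuUsage)
	}
	fmt.Printf("avg=%.2f max=%.2f\n", s.totalCPU/float64(s.cpuCounter), s.maxCPU) // avg=1.33 max=2.00
}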
@@ -168,33 +190,78 @@ func (m *Monitor) GetRequestCount() int {
 	return int(m.requests.Load())
 }
 
+func (m *Monitor) UpdatePID(egressID string, pid int) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	ps := m.pending[egressID]
+	delete(m.pending, egressID)
+
+	if existing := m.procStats[pid]; existing != nil {
+		ps.maxCPU = existing.maxCPU
+		ps.totalCPU = existing.totalCPU
+		ps.cpuCounter = existing.cpuCounter
+	}
+	m.procStats[pid] = ps
+}
+
+func (m *Monitor) CloseEgressStats(egressID string) (float64, float64) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	for pid, ps := range m.procStats {
+		if ps.egressID == egressID {
+			delete(m.procStats, pid)
+			return ps.totalCPU / float64(ps.cpuCounter), ps.maxCPU
+		}
+	}
+
+	return 0, 0
+}
+
 func (m *Monitor) CanAcceptRequest(req *rpc.StartEgressRequest) bool {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 
-	return m.canAcceptRequest(req)
+	return m.canAcceptRequestLocked(req)
 }
 
-func (m *Monitor) canAcceptRequest(req *rpc.StartEgressRequest) bool {
+func (m *Monitor) canAcceptRequestLocked(req *rpc.StartEgressRequest) bool {
 	accept := false
 
 	total := m.cpuStats.NumCPU()
-	available := m.cpuStats.GetCPUIdle() - m.pendingCPUs.Load()
-
-	logger.Debugw("cpu check",
-		"total", total,
-		"available", available,
-		"active_requests", m.requests,
-	)
 
+	var available float64
 	if m.requests.Load() == 0 {
 		// if no requests, use total
 		available = total
 	} else {
+		var used float64
+		for _, ps := range m.pending {
+			if ps.pendingUsage > ps.lastUsage {
+				used += ps.pendingUsage
+			} else {
+				used += ps.lastUsage
+			}
+		}
+		for _, ps := range m.procStats {
+			if ps.pendingUsage > ps.lastUsage {
+				used += ps.pendingUsage
+			} else {
+				used += ps.lastUsage
+			}
+		}
+
 		// if already running requests, cap usage at MaxCpuUtilization
-		available -= (1 - m.cpuCostConfig.MaxCpuUtilization) * total
+		available = total - used - (total * (1 - m.cpuCostConfig.MaxCpuUtilization))
 	}
 
+	logger.Debugw("cpu check",
+		"total", total,
+		"available", available,
+		"active_requests", m.requests,
+	)
+
 	switch req.Request.(type) {
 	case *rpc.StartEgressRequest_RoomComposite:
 		accept = available >= m.cpuCostConfig.RoomCompositeCpuCost
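The availability math in canAcceptRequestLocked can be checked by hand: every tracked egress contributes the larger of its pending reservation and its last measured usage, and (1 - MaxCpuUtilization) of the machine is always held back as headroom. A worked sketch with assumed numbers (the 8-CPU host and MaxCpuUtilization of 0.8 are hypothetical):

// sketch: the "available" computation with made-up inputs
package main

import "fmt"

func main() {
	total := 8.0             // cpuStats.NumCPU()
	maxCpuUtilization := 0.8 // hypothetical config value

	// one egress per entry: max(pendingUsage, lastUsage) is what it currently "costs"
	perEgress := []struct{ pendingUsage, lastUsage float64 }{
		{pendingUsage: 3.0, lastUsage: 0.4}, // recently accepted, CPU hold still active
		{pendingUsage: 0.0, lastUsage: 2.1}, // hold expired, measured usage counts
	}

	used := 0.0
	for _, ps := range perEgress {
		if ps.pendingUsage > ps.lastUsage {
			used += ps.pendingUsage
		} else {
			used += ps.lastUsage
		}
	}

	available := total - used - total*(1-maxCpuUtilization)
	fmt.Printf("available=%.2f CPUs\n", available) // 8 - 5.1 - 1.6 = 1.30
}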
@@ -215,10 +282,12 @@ func (m *Monitor) AcceptRequest(req *rpc.StartEgressRequest) error {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 
-	if !m.canAcceptRequest(req) {
+	if !m.canAcceptRequestLocked(req) {
 		return errors.ErrResourceExhausted
 	}
 
+	m.requests.Inc()
+
 	var cpuHold float64
 	switch req.Request.(type) {
 	case *rpc.StartEgressRequest_RoomComposite:
@@ -233,9 +302,13 @@ func (m *Monitor) AcceptRequest(req *rpc.StartEgressRequest) error {
 		cpuHold = m.cpuCostConfig.TrackCpuCost
 	}
 
-	m.requests.Inc()
-	m.pendingCPUs.Add(cpuHold)
-	time.AfterFunc(cpuHoldDuration, func() { m.pendingCPUs.Sub(cpuHold) })
+	ps := &processStats{
+		egressID:     req.EgressId,
+		pendingUsage: cpuHold,
+	}
+	time.AfterFunc(cpuHoldDuration, func() { ps.pendingUsage = 0 })
+	m.pending[req.EgressId] = ps
+
 	return nil
 }
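Taken together, the new methods give each egress a three-step lifecycle: AcceptRequest records a reservation keyed by egress ID, UpdatePID re-keys it by process ID once the handler is running, and CloseEgressStats returns the average and peak CPU when the egress finishes. A hypothetical wiring sketch (the call site, the launchHandler helper, and the surrounding error handling are assumptions, not part of this diff):

// sketch only: how a caller outside this file might drive the new Monitor methods
func runEgress(m *Monitor, req *rpc.StartEgressRequest) error {
	if err := m.AcceptRequest(req); err != nil {
		return err // errors.ErrResourceExhausted: not enough CPU available
	}

	cmd, err := launchHandler(req) // hypothetical: forks the per-egress handler process
	if err != nil {
		return err
	}
	m.UpdatePID(req.EgressId, cmd.Process.Pid) // moves pending[egressID] into procStats[pid]

	_ = cmd.Wait() // egress runs to completion

	avgCPU, maxCPU := m.CloseEgressStats(req.EgressId) // also removes the entry
	logger.Debugw("egress CPU stats", "egressID", req.EgressId, "avgCPU", avgCPU, "maxCPU", maxCPU)
	return nil
}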