Move the capacity reporting into db_discovery #3778
base: master
@@ -14,34 +14,33 @@ import (
 	"github.com/livepeer/go-livepeer/core"
 	"github.com/livepeer/go-livepeer/eth"
 	lpTypes "github.com/livepeer/go-livepeer/eth/types"
+	"github.com/livepeer/go-livepeer/monitor"
 	"github.com/livepeer/go-livepeer/net"
 	"github.com/livepeer/go-livepeer/pm"
 	"github.com/livepeer/go-livepeer/server"

 	"github.com/golang/glog"
 )

-var cacheRefreshInterval = 25 * time.Minute
-var getTicker = func() *time.Ticker {
-	return time.NewTicker(cacheRefreshInterval)
-}
+var networkCapabilitiesReportingInterval = 25 * time.Minute

 type ticketParamsValidator interface {
 	ValidateTicketParams(ticketParams *pm.TicketParams) error
 }

 type DBOrchestratorPoolCache struct {
-	store                 common.OrchestratorStore
-	lpEth                 eth.LivepeerEthClient
-	ticketParamsValidator ticketParamsValidator
-	rm                    common.RoundsManager
-	bcast                 common.Broadcaster
-	orchBlacklist         []string
-	discoveryTimeout      time.Duration
-	node                  *core.LivepeerNode
+	store                           common.OrchestratorStore
+	lpEth                           eth.LivepeerEthClient
+	ticketParamsValidator           ticketParamsValidator
+	rm                              common.RoundsManager
+	bcast                           common.Broadcaster
+	orchBlacklist                   []string
+	discoveryTimeout                time.Duration
+	node                            *core.LivepeerNode
+	lastNetworkCapabilitiesReported time.Time
 }

-func NewDBOrchestratorPoolCache(ctx context.Context, node *core.LivepeerNode, rm common.RoundsManager, orchBlacklist []string, discoveryTimeout time.Duration) (*DBOrchestratorPoolCache, error) {
+func NewDBOrchestratorPoolCache(ctx context.Context, node *core.LivepeerNode, rm common.RoundsManager, orchBlacklist []string, discoveryTimeout time.Duration, liveAICapReportInterval time.Duration) (*DBOrchestratorPoolCache, error) {
 	if node.Eth == nil {
 		return nil, fmt.Errorf("could not create DBOrchestratorPoolCache: LivepeerEthClient is nil")
 	}
@@ -66,7 +65,7 @@ func NewDBOrchestratorPoolCache(ctx context.Context, node *core.LivepeerNode, rm
 			return err
 		}

-		if err := dbo.pollOrchestratorInfo(ctx); err != nil {
+		if err := dbo.pollOrchestratorInfo(ctx, liveAICapReportInterval); err != nil {
 			return err
 		}
 		return nil
@@ -252,13 +251,13 @@ func (dbo *DBOrchestratorPoolCache) cacheOrchestratorStake() error {
 	return nil
 }

-func (dbo *DBOrchestratorPoolCache) pollOrchestratorInfo(ctx context.Context) error {
+func (dbo *DBOrchestratorPoolCache) pollOrchestratorInfo(ctx context.Context, liveAICapReportInterval time.Duration) error {
 	if err := dbo.cacheOrchInfos(); err != nil {
 		glog.Errorf("unable to poll orchestrator info: %v", err)
 		return err
 	}

-	ticker := getTicker()
+	ticker := time.NewTicker(liveAICapReportInterval)
 	go func() {
 		for {
 			select {
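For context on this hunk, below is a minimal standalone sketch of the poll-loop structure with the refresh interval supplied by the caller, which is what replacing the package-level getTicker helper with the liveAICapReportInterval parameter amounts to. The pollLoop and refresh names are hypothetical and not part of the PR.

package main

import (
	"context"
	"fmt"
	"time"
)

// pollLoop runs refresh once up front and then on every tick until ctx is cancelled.
func pollLoop(ctx context.Context, interval time.Duration, refresh func() error) {
	if err := refresh(); err != nil {
		fmt.Println("initial refresh failed:", err)
		return
	}
	ticker := time.NewTicker(interval)
	go func() {
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				if err := refresh(); err != nil {
					fmt.Println("refresh failed:", err)
				}
			case <-ctx.Done():
				return
			}
		}
	}()
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 350*time.Millisecond)
	defer cancel()
	pollLoop(ctx, 100*time.Millisecond, func() error {
		fmt.Println("refreshing orchestrator info")
		return nil
	})
	<-ctx.Done()
}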
@@ -393,12 +392,59 @@ func (dbo *DBOrchestratorPoolCache) cacheOrchInfos() error {
 			i = numOrchs //exit loop
 		}
 	}
-	//save network capabilities in LivepeerNode
-	dbo.node.UpdateNetworkCapabilities(orchNetworkCapabilities)
+	// Only update network capabilities every 25 minutes
+	if time.Since(dbo.lastNetworkCapabilitiesReported) >= networkCapabilitiesReportingInterval {
+		// Save network capabilities in LivepeerNode
+		dbo.node.UpdateNetworkCapabilities(orchNetworkCapabilities)
+
+		dbo.lastNetworkCapabilitiesReported = time.Now()
+	}
Comment on lines +396 to +402:

Why don't we always send network capabilities?

Reply:

I wanted to leave the existing behaviour alone, so only sending to Kafka every 25 mins, wdyt @ad-astra-video? Going from 25 mins to 10 seconds seems wrong :)
+
+	// Report AI container capacity metrics
+	reportAICapacityFromNetworkCapabilities(orchNetworkCapabilities)

 	return nil
 }

+func reportAICapacityFromNetworkCapabilities(orchNetworkCapabilities []*common.OrchNetworkCapabilities) {
+	// Build structured capacity data
+	modelCapacities := make(map[string]*monitor.ModelAICapacities)
+
+	for _, orchCap := range orchNetworkCapabilities {
+		models := getModelCapsFromNetCapabilities(orchCap.Capabilities)
+
+		for modelID, model := range models {
+			if _, exists := modelCapacities[modelID]; !exists {
+				modelCapacities[modelID] = &monitor.ModelAICapacities{
+					ModelID:       modelID,
+					Orchestrators: make(map[string]monitor.AIContainerCapacity),
+				}
+			}
+
+			capacity := monitor.AIContainerCapacity{
+				Idle:  int(model.Capacity),
+				InUse: int(model.CapacityInUse),
+			}
+			modelCapacities[modelID].Orchestrators[orchCap.OrchURI] = capacity
+		}
+	}
+
+	monitor.ReportAIContainerCapacity(modelCapacities)
+}
+
+func getModelCapsFromNetCapabilities(caps *net.Capabilities) map[string]*net.Capabilities_CapabilityConstraints_ModelConstraint {
+	if caps == nil || caps.Constraints == nil || caps.Constraints.PerCapability == nil {
+		return nil
+	}
+	liveAI, ok := caps.Constraints.PerCapability[uint32(core.Capability_LiveVideoToVideo)]
+	if !ok {
+		return nil
+	}
+
+	return liveAI.Models
+}

 func (dbo *DBOrchestratorPoolCache) Broadcaster() common.Broadcaster {
 	return dbo.bcast
 }
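To make the shape of the aggregated data concrete, here is a small standalone sketch of the same grouping step with sample values. It uses stand-in structs rather than the monitor package types; the struct layout, sample model ID, and orchestrator URIs are illustrative assumptions.

package main

import "fmt"

// Stand-ins mirroring the shape implied by monitor.AIContainerCapacity and
// monitor.ModelAICapacities in this PR; exact field layout is an assumption.
type containerCapacity struct {
	Idle  int
	InUse int
}

type modelCapacities struct {
	ModelID       string
	Orchestrators map[string]containerCapacity
}

func main() {
	// Two orchestrators advertising capacity for the same Live AI model.
	samples := []struct {
		orchURI     string
		modelID     string
		idle, inUse int
	}{
		{"https://orch-a:8935", "example-live-model", 3, 1},
		{"https://orch-b:8935", "example-live-model", 2, 2},
	}

	perModel := map[string]*modelCapacities{}
	for _, s := range samples {
		m, ok := perModel[s.modelID]
		if !ok {
			m = &modelCapacities{ModelID: s.modelID, Orchestrators: map[string]containerCapacity{}}
			perModel[s.modelID] = m
		}
		m.Orchestrators[s.orchURI] = containerCapacity{Idle: s.idle, InUse: s.inUse}
	}

	// One entry per model, keyed by orchestrator URI, ready to hand to the reporter.
	for id, m := range perModel {
		fmt.Println(id, m.Orchestrators)
	}
}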
Review comment:

Just a note that we need to be careful when deploying this change: if we have this flag configured in infra, the gateway will fail to start because the flag does not exist anymore.

Reply:

Yeah, it's a pain. I guess I could leave the flag in but not use it, then come along after the prod deploy and remove it.
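One way to keep that compatibility, sketched with the standard library flag package and a placeholder flag name; the flag this PR actually removes and go-livepeer's own starter wiring are not shown here.

package main

import (
	"flag"
	"fmt"
)

func main() {
	// Keep the old flag registered so existing deployments that still pass it
	// do not fail to start; its value is ignored until the flag can be dropped
	// after the production rollout. "aiCapacityReportInterval" is a placeholder
	// name, not the flag removed by this PR.
	_ = flag.Duration("aiCapacityReportInterval", 0, "Deprecated: no longer used, kept for compatibility")
	flag.Parse()

	if flagWasPassed("aiCapacityReportInterval") {
		fmt.Println("warning: -aiCapacityReportInterval is deprecated and ignored")
	}
}

// flagWasPassed reports whether the named flag was explicitly set on the command line.
func flagWasPassed(name string) bool {
	passed := false
	flag.Visit(func(f *flag.Flag) {
		if f.Name == name {
			passed = true
		}
	})
	return passed
}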