From 6b53a1736b866bb4c1792b53d0e59685b37d5be8 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Thu, 18 Dec 2025 10:22:26 +0000 Subject: [PATCH 01/50] abstract underlying memory store --- .../pkg/bridge/event-data-store-bridge.go | 7 + scheduler/pkg/store/memory.go | 297 +++++++++++++----- scheduler/pkg/store/mesh.go | 5 + scheduler/pkg/store/storage.go | 20 ++ scheduler/pkg/store/store.go | 1 + 5 files changed, 246 insertions(+), 84 deletions(-) create mode 100644 scheduler/pkg/bridge/event-data-store-bridge.go create mode 100644 scheduler/pkg/store/storage.go diff --git a/scheduler/pkg/bridge/event-data-store-bridge.go b/scheduler/pkg/bridge/event-data-store-bridge.go new file mode 100644 index 0000000000..df462a561f --- /dev/null +++ b/scheduler/pkg/bridge/event-data-store-bridge.go @@ -0,0 +1,7 @@ +package bridge + +import "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" + +type EventDataStoreBridge interface { + AddServerReplica(models []coordinator.ModelEventMsg, msg coordinator.ServerEventMsg) error +} diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index 4a0611c92c..4a6b26823a 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -10,10 +10,12 @@ the Change License after the Change Date as each is defined in accordance with t package store import ( + "errors" "fmt" "sort" "sync" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/bridge" log "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" @@ -26,14 +28,15 @@ import ( type MemoryStore struct { mu sync.RWMutex opLocks sync.Map - store *LocalSchedulerStore + store Storage logger log.FieldLogger eventHub *coordinator.EventHub + bridge bridge.EventDataStoreBridge } func NewMemoryStore( logger log.FieldLogger, - store *LocalSchedulerStore, + store Storage, eventHub *coordinator.EventHub, ) *MemoryStore { return &MemoryStore{ @@ -43,24 +46,35 @@ func NewMemoryStore( } } -func (m *MemoryStore) GetAllModels() []string { +func (m *MemoryStore) GetAllModels() ([]string, error) { m.mu.RLock() defer m.mu.RUnlock() var modelNames []string - for modelName := range m.store.models { - modelNames = append(modelNames, modelName) + + models, err := m.store.ListModels() + if err != nil { + return nil, err } - return modelNames + + for _, model := range models { + modelNames = append(modelNames, model.Name()) + } + return modelNames, nil } func (m *MemoryStore) GetModels() ([]*ModelSnapshot, error) { m.mu.RLock() defer m.mu.RUnlock() + models, err := m.store.ListModels() + if err != nil { + return nil, err + } + foundModels := []*ModelSnapshot{} - for name, model := range m.store.models { + for _, model := range models { snapshot := &ModelSnapshot{ - Name: name, + Name: model.Name(), Deleted: model.IsDeleted(), Versions: model.versions, } @@ -69,12 +83,18 @@ func (m *MemoryStore) GetModels() ([]*ModelSnapshot, error) { return foundModels, nil } -func (m *MemoryStore) addModelVersionIfNotExists(req *agent.ModelVersion) (*Model, *ModelVersion) { +func (m *MemoryStore) addModelVersionIfNotExists(req *agent.ModelVersion) (*Model, *ModelVersion, error) { modelName := req.GetModel().GetMeta().GetName() - model, ok := m.store.models[modelName] - if !ok { + model, err := m.store.GetModel(modelName) + if err != nil { + if !errors.Is(err, ErrNotFound) { + return nil, nil, err + } + // TODO pointer save issue model = &Model{} - m.store.models[modelName] = model + if err := m.store.AddModel(model); err != nil { + return nil, nil, err + } } if existingModelVersion := model.GetVersion(req.GetVersion()); existingModelVersion == nil { modelVersion := NewDefaultModelVersion(req.GetModel(), req.GetVersion()) @@ -82,9 +102,9 @@ func (m *MemoryStore) addModelVersionIfNotExists(req *agent.ModelVersion) (*Mode sort.SliceStable(model.versions, func(i, j int) bool { // resort model versions based on version number return model.versions[i].GetVersion() < model.versions[j].GetVersion() }) - return model, modelVersion + return model, modelVersion, nil } else { - return model, existingModelVersion + return model, existingModelVersion, nil } } @@ -117,15 +137,28 @@ func (m *MemoryStore) UpdateModel(req *pb.LoadModelRequest) error { modelName, ) } - model, ok := m.store.models[modelName] - if !ok { - model = &Model{} - m.store.models[modelName] = model + model, err := m.store.GetModel(modelName) + if err != nil { + if !errors.Is(err, ErrNotFound) { + return err + } + // TODO pointer + model = &Model{ + name: modelName, + } + if err := m.store.AddModel(model); err != nil { + return err + } m.addNextModelVersion(model, req.GetModel()) } else if model.IsDeleted() { if model.Inactive() { - model = &Model{} - m.store.models[modelName] = model + // TODO pointer + model = &Model{ + name: modelName, + } + if err := m.store.AddModel(model); err != nil { + return err + } m.addNextModelVersion(model, req.GetModel()) } else { return fmt.Errorf( @@ -158,8 +191,8 @@ func (m *MemoryStore) UpdateModel(req *pb.LoadModelRequest) error { } func (m *MemoryStore) getModelImpl(key string) *ModelSnapshot { - model, ok := m.store.models[key] - if ok { + model, err := m.store.GetModel(key) + if err == nil { return m.deepCopy(model, key) } @@ -216,8 +249,8 @@ func (m *MemoryStore) removeModelImpl(req *pb.UnloadModelRequest) error { m.mu.Lock() defer m.mu.Unlock() modelName := req.GetModel().GetName() - model, ok := m.store.models[modelName] - if ok { + model, err := m.store.GetModel(modelName) + if err == nil { // Updating the k8s meta is required to be updated so status updates back (to manager) // will match latest generation value. Previous generation values might be ignored by manager. if req.GetKubernetesMeta() != nil { // k8s meta can be nil if unload is called directly using scheduler grpc api @@ -228,43 +261,69 @@ func (m *MemoryStore) removeModelImpl(req *pb.UnloadModelRequest) error { m.setModelGwStatusToTerminate(true, model.Latest()) m.updateModelStatus(true, true, model.Latest(), model.GetLastAvailableModelVersion()) return nil - } else { + } + + if errors.Is(err, ErrNotFound) { return fmt.Errorf("Model %s not found", req.GetModel().GetName()) } + return err } func (m *MemoryStore) GetServers(shallow bool, modelDetails bool) ([]*ServerSnapshot, error) { m.mu.RLock() defer m.mu.RUnlock() - var servers []*ServerSnapshot - for _, server := range m.store.servers { - servers = append(servers, server.CreateSnapshot(shallow, modelDetails)) + + servers, err := m.store.ListServers() + if err != nil { + return nil, err } - return servers, nil + + snapshots := make([]*ServerSnapshot, 0, len(servers)) + for _, server := range servers { + snapshots = append(snapshots, server.CreateSnapshot(shallow, modelDetails)) + } + return snapshots, nil } func (m *MemoryStore) GetServer(serverKey string, shallow bool, modelDetails bool) (*ServerSnapshot, error) { m.mu.RLock() defer m.mu.RUnlock() - server := m.store.servers[serverKey] - if server == nil { - return nil, fmt.Errorf("Server [%s] not found", serverKey) - } else { - // TODO: refactor cleanly - snapshot := server.CreateSnapshot(shallow, modelDetails) - if modelDetails { - // this is a hint to the caller that the server is in a state where it can be scaled down - snapshot.Stats = m.getServerStats(serverKey) + + server, err := m.store.GetServer(serverKey) + if err != nil { + if errors.Is(err, ErrNotFound) { + return nil, fmt.Errorf("Server [%s] not found", serverKey) + } + return nil, err + } + + // TODO: refactor cleanly + snapshot := server.CreateSnapshot(shallow, modelDetails) + if modelDetails { + // this is a hint to the caller that the server is in a state where it can be scaled down + snapshot.Stats, err = m.getServerStats(serverKey) + if err != nil { + return nil, err } - return snapshot, nil } + return snapshot, nil } -func (m *MemoryStore) getServerStats(serverKey string) *ServerStats { - return &ServerStats{ - NumEmptyReplicas: m.numEmptyServerReplicas(serverKey), - MaxNumReplicaHostedModels: m.maxNumModelReplicasForServer(serverKey), +func (m *MemoryStore) getServerStats(serverKey string) (*ServerStats, error) { + emptyReplicas, err := m.numEmptyServerReplicas(serverKey) + if err != nil { + return nil, err + } + + maxModelReplicas, err := m.maxNumModelReplicasForServer(serverKey) + if err != nil { + return nil, err } + + return &ServerStats{ + NumEmptyReplicas: emptyReplicas, + MaxNumReplicaHostedModels: maxModelReplicas, + }, nil } func (m *MemoryStore) getModelServer( @@ -273,16 +332,16 @@ func (m *MemoryStore) getModelServer( serverKey string, ) (*Model, *ModelVersion, *Server, error) { // Validate - model, ok := m.store.models[modelKey] - if !ok { + model, err := m.store.GetModel(modelKey) + if err != nil { return nil, nil, nil, fmt.Errorf("failed to find model %s", modelKey) } modelVersion := model.GetVersion(version) if modelVersion == nil { return nil, nil, nil, fmt.Errorf("Version not found for model %s, version %d", modelKey, version) } - server, ok := m.store.servers[serverKey] - if !ok { + server, err := m.store.GetServer(serverKey) + if err != nil { return nil, nil, nil, fmt.Errorf("failed to find server %s", serverKey) } return model, modelVersion, server, nil @@ -318,8 +377,12 @@ func (m *MemoryStore) updateLoadedModelsImpl( logger := m.logger.WithField("func", "updateLoadedModelsImpl") // Validate - model, ok := m.store.models[modelKey] - if !ok { + // TODO pointer issue + model, err := m.store.GetModel(modelKey) + if err != nil { + if errors.Is(err, ErrNotFound) { + return nil, fmt.Errorf("Model [%s] not found", modelKey) + } return nil, fmt.Errorf("failed to find model %s", modelKey) } @@ -339,8 +402,8 @@ func (m *MemoryStore) updateLoadedModelsImpl( }, nil } - server, ok := m.store.servers[serverKey] - if !ok { + server, err := m.store.GetServer(serverKey) + if err != nil { return nil, fmt.Errorf("failed to find server %s", serverKey) } @@ -446,8 +509,9 @@ func (m *MemoryStore) unloadVersionModelsImpl(modelKey string, version uint32) ( defer m.mu.Unlock() // Validate - model, ok := m.store.models[modelKey] - if !ok { + // TODO pointer issue + model, err := m.store.GetModel(modelKey) + if err != nil { return nil, false, fmt.Errorf("failed to find model %s", modelKey) } modelVersion := model.GetVersion(version) @@ -563,8 +627,10 @@ func (m *MemoryStore) updateModelStateImpl( // Update models loaded onto replica for relevant state if desiredState == Loaded || desiredState == Loading || desiredState == Unloaded || desiredState == LoadFailed { - server, ok := m.store.servers[serverKey] - if ok { + // TODO pointer + server, err := m.store.GetServer(serverKey) + // TODO handle error + if err == nil { replica, ok := server.replicas[replicaIdx] if ok { if desiredState == Loaded || desiredState == Loading { @@ -616,8 +682,9 @@ func (m *MemoryStore) updateReservedMemory( ) { // update reserved memory that is being used for sorting replicas // do we need to lock replica update? - server, ok := m.store.servers[serverKey] - if ok { + server, err := m.store.GetServer(serverKey) + // TODO handle error?? + if err == nil { replica, okReplica := server.replicas[replicaIdx] if okReplica { if modelReplicaState == LoadRequested { @@ -654,10 +721,16 @@ func (m *MemoryStore) addServerReplicaImpl(request *agent.AgentSubscribeRequest) m.mu.Lock() defer m.mu.Unlock() - server, ok := m.store.servers[request.ServerName] - if !ok { + // TODO pointer issue + server, err := m.store.GetServer(request.ServerName) + if err != nil { + if !errors.Is(err, ErrNotFound) { + return nil, coordinator.ServerEventMsg{}, err + } server = NewServer(request.ServerName, request.Shared) - m.store.servers[request.ServerName] = server + if err := m.store.AddServer(server); err != nil { + return nil, coordinator.ServerEventMsg{}, fmt.Errorf("failed to add server %s: %w", request.ServerName, err) + } } server.shared = request.Shared @@ -674,7 +747,10 @@ func (m *MemoryStore) addServerReplicaImpl(request *agent.AgentSubscribeRequest) var evts []coordinator.ModelEventMsg for _, modelVersionReq := range request.LoadedModels { - model, modelVersion := m.addModelVersionIfNotExists(modelVersionReq) + model, modelVersion, err := m.addModelVersionIfNotExists(modelVersionReq) + if err != nil { + return nil, coordinator.ServerEventMsg{}, err + } modelVersion.replicas[int(request.ReplicaIdx)] = ReplicaStatus{State: Loaded} modelVersion.SetServer(request.ServerName) m.updateModelStatus(true, false, modelVersion, model.GetLastAvailableModelVersion()) @@ -713,9 +789,9 @@ func (m *MemoryStore) removeServerReplicaImpl(serverName string, replicaIdx int) m.mu.Lock() defer m.mu.Unlock() - server, ok := m.store.servers[serverName] - if !ok { - return nil, nil, fmt.Errorf("Failed to find server %s", serverName) + server, err := m.store.GetServer(serverName) + if err != nil { + return nil, nil, fmt.Errorf("failed to find server %s: %w", serverName, err) } serverReplica, ok := server.replicas[replicaIdx] if !ok { @@ -724,7 +800,9 @@ func (m *MemoryStore) removeServerReplicaImpl(serverName string, replicaIdx int) delete(server.replicas, replicaIdx) // TODO we should not reschedule models on servers with dedicated models, e.g. non shareable servers if len(server.replicas) == 0 { - delete(m.store.servers, serverName) + if err := m.store.DeleteServer(serverName); err != nil { + return nil, nil, fmt.Errorf("failed to delete server %s: %w", serverName, err) + } } loadedModelsRemoved, loadedEvts := m.removeModelfromServerReplica(serverReplica.loadedModels, replicaIdx) loadingModelsRemoved, loadingEtvs := m.removeModelfromServerReplica(serverReplica.loadingModels, replicaIdx) @@ -741,8 +819,9 @@ func (m *MemoryStore) removeModelfromServerReplica(lModels map[ModelVersionID]bo var evts []coordinator.ModelEventMsg // Find models to reschedule due to this server replica being removed for modelVersionID := range lModels { - model, ok := m.store.models[modelVersionID.Name] - if ok { + // TODO pointer issue + model, err := m.store.GetModel(modelVersionID.Name) + if err == nil { modelVersion := model.GetVersion(modelVersionID.Version) if modelVersion != nil { modelVersion.DeleteReplica(replicaIdx) @@ -787,8 +866,9 @@ func (m *MemoryStore) DrainServerReplica(serverName string, replicaIdx int) ([]s func (m *MemoryStore) drainServerReplicaImpl(serverName string, replicaIdx int) ([]string, error) { logger := m.logger.WithField("func", "DrainServerReplica") - server, ok := m.store.servers[serverName] - if !ok { + // TODO pointer issue + server, err := m.store.GetServer(serverName) + if err != nil { return nil, fmt.Errorf("Failed to find server %s", serverName) } serverReplica, ok := server.replicas[replicaIdx] @@ -816,8 +896,10 @@ func (m *MemoryStore) findModelsToReSchedule(models map[ModelVersionID]bool, rep modelsReSchedule := make([]string, 0) for modelVersionID := range models { - model, ok := m.store.models[modelVersionID.Name] - if ok { + // TODO pointer issue + // TODO handle error? + model, err := m.store.GetModel(modelVersionID.Name) + if err == nil { modelVersion := model.GetVersion(modelVersionID.Version) if modelVersion != nil { modelVersion.SetReplicaState(replicaIdx, Draining, "trigger to drain") @@ -838,10 +920,15 @@ func (m *MemoryStore) ServerNotify(request *pb.ServerNotify) error { logger.Debugf("ServerNotify %v", request) - server, ok := m.store.servers[request.Name] - if !ok { + server, err := m.store.GetServer(request.Name) + if err != nil { + if !errors.Is(err, ErrNotFound) { + return err + } server = NewServer(request.Name, request.Shared) - m.store.servers[request.Name] = server + if err := m.store.AddServer(server); err != nil { + return err + } } server.SetExpectedReplicas(int(request.ExpectedReplicas)) server.SetMinReplicas(int(request.MinReplicas)) @@ -850,29 +937,37 @@ func (m *MemoryStore) ServerNotify(request *pb.ServerNotify) error { return nil } -func (m *MemoryStore) numEmptyServerReplicas(serverName string) uint32 { +func (m *MemoryStore) numEmptyServerReplicas(serverName string) (uint32, error) { emptyReplicas := uint32(0) - server, ok := m.store.servers[serverName] - if !ok { - return emptyReplicas + server, err := m.store.GetServer(serverName) + if err != nil { + if !errors.Is(err, ErrNotFound) { + return 0, err + } + return emptyReplicas, nil } for _, replica := range server.replicas { if len(replica.GetLoadedOrLoadingModelVersions()) == 0 { emptyReplicas++ } } - return emptyReplicas + return emptyReplicas, nil } -func (m *MemoryStore) maxNumModelReplicasForServer(serverName string) uint32 { +func (m *MemoryStore) maxNumModelReplicasForServer(serverName string) (uint32, error) { + models, err := m.store.ListModels() + if err != nil { + return 0, err + } + maxNumModels := uint32(0) - for _, model := range m.store.models { + for _, model := range models { latest := model.Latest() if latest != nil && latest.Server() == serverName { maxNumModels = max(maxNumModels, uint32(latest.DesiredReplicas())) } } - return maxNumModels + return maxNumModels, nil } func toSchedulerLoadedModels(agentLoadedModels []*agent.ModelVersion) map[ModelVersionID]bool { @@ -911,8 +1006,9 @@ func (m *MemoryStore) setModelGwModelStateImpl(name string, versionNumber uint32 m.mu.Lock() defer m.mu.Unlock() - model, ok := m.store.models[name] - if !ok { + // TODO pointer issue + model, err := m.store.GetModel(name) + if err != nil { return nil, fmt.Errorf("failed to find model %s", name) } modelVersion := model.GetVersion(versionNumber) @@ -932,3 +1028,36 @@ func (m *MemoryStore) setModelGwModelStateImpl(name string, versionNumber uint32 } return evts, nil } + +func (m *MemoryStore) EmitEvents() error { + servers, err := m.store.ListServers() + if err != nil { + return err + } + + for _, server := range servers { + for id := range server.replicas { + m.eventHub.PublishServerEvent(serverUpdateEventSource, coordinator.ServerEventMsg{ + ServerName: server.name, + ServerIdx: uint32(id), + Source: serverUpdateEventSource, + UpdateContext: coordinator.SERVER_REPLICA_CONNECTED, // TODO can we be confident of that? + }) + } + } + + models, err := m.store.ListModels() + if err != nil { + return err + } + + for _, model := range models { + m.eventHub.PublishModelEvent(modelUpdateEventSource, coordinator.ModelEventMsg{ + ModelName: model.Name(), + Source: modelUpdateEventSource, + ModelVersion: model.GetLastAvailableModelVersion().version, + }) + } + + return nil +} diff --git a/scheduler/pkg/store/mesh.go b/scheduler/pkg/store/mesh.go index 47025fe992..ac0360a5c3 100644 --- a/scheduler/pkg/store/mesh.go +++ b/scheduler/pkg/store/mesh.go @@ -37,6 +37,7 @@ func NewLocalSchedulerStore() *LocalSchedulerStore { } type Model struct { + name string versions []*ModelVersion deleted atomic.Bool } @@ -318,6 +319,10 @@ func (m ModelReplicaState) IsLoadingOrLoaded() bool { return (m == Loaded || m == LoadRequested || m == Loading || m == Available || m == LoadedUnavailable) } +func (m *Model) Name() string { + return m.name +} + func (m *Model) HasLatest() bool { return len(m.versions) > 0 } diff --git a/scheduler/pkg/store/storage.go b/scheduler/pkg/store/storage.go new file mode 100644 index 0000000000..eadc3f7f2f --- /dev/null +++ b/scheduler/pkg/store/storage.go @@ -0,0 +1,20 @@ +package store + +import "errors" + +var ( + ErrNotFound = errors.New("not found") +) + +type Storage interface { + GetModel(name string) (*Model, error) + AddModel(model *Model) error + ListModels() ([]*Model, error) + UpdateModel(model *Model) error + DeleteModel(name string) error + GetServer(name string) (*Server, error) + AddServer(server *Server) error + ListServers() ([]*Server, error) + UpdateServer(server *Server) error + DeleteServer(serverName string) error +} diff --git a/scheduler/pkg/store/store.go b/scheduler/pkg/store/store.go index 8bbc2a627e..3552ed2335 100644 --- a/scheduler/pkg/store/store.go +++ b/scheduler/pkg/store/store.go @@ -156,4 +156,5 @@ type ModelStore interface { FailedScheduling(modelID string, version uint32, reason string, reset bool) error GetAllModels() []string SetModelGwModelState(name string, versionNumber uint32, status ModelState, reason string, source string) error + EmitEvents() error } From 570a87daf7d70ebbd3e4a1add6714d4271c9db72 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 29 Dec 2025 11:53:36 +0000 Subject: [PATCH 02/50] pb for storage --- apis/Makefile | 1 + apis/go/mlops/scheduler/db/db.pb.go | 1336 +++++++++++++++++ apis/mlops/scheduler/db/db.proto | 125 ++ apis/mlops/scheduler/storage.proto | 1 + .../pkg/bridge/event-data-store-bridge.go | 7 - scheduler/pkg/store/memory.go | 4 +- scheduler/pkg/store/mesh.go | 4 - scheduler/pkg/store/mock/store.go | 19 +- scheduler/pkg/store/storage.go | 9 + scheduler/pkg/store/store.go | 2 +- 10 files changed, 1492 insertions(+), 16 deletions(-) create mode 100644 apis/go/mlops/scheduler/db/db.pb.go create mode 100644 apis/mlops/scheduler/db/db.proto delete mode 100644 scheduler/pkg/bridge/event-data-store-bridge.go diff --git a/apis/Makefile b/apis/Makefile index eb28b90b83..4f75133c1f 100644 --- a/apis/Makefile +++ b/apis/Makefile @@ -163,6 +163,7 @@ build-go: \ ./mlops/proxy/proxy.proto \ ./mlops/scheduler/scheduler.proto \ ./mlops/scheduler/storage.proto \ + ./mlops/scheduler/db/db.proto \ ./mlops/chainer/chainer.proto \ ./mlops/v2_dataplane/v2_dataplane.proto \ ./mlops/health/health.proto \ diff --git a/apis/go/mlops/scheduler/db/db.pb.go b/apis/go/mlops/scheduler/db/db.pb.go new file mode 100644 index 0000000000..30f89c659a --- /dev/null +++ b/apis/go/mlops/scheduler/db/db.pb.go @@ -0,0 +1,1336 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.34.2 +// protoc v5.27.2 +// source: mlops/scheduler/db/db.proto + +package db + +import ( + scheduler "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + timestamppb "google.golang.org/protobuf/types/known/timestamppb" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +// ModelState represents the state of a model +type ModelState int32 + +const ( + ModelState_MODEL_STATE_UNKNOWN ModelState = 0 + ModelState_MODEL_STATE_PROGRESSING ModelState = 1 + ModelState_MODEL_STATE_AVAILABLE ModelState = 2 + ModelState_MODEL_STATE_FAILED ModelState = 3 + ModelState_MODEL_STATE_TERMINATING ModelState = 4 + ModelState_MODEL_STATE_TERMINATED ModelState = 5 + ModelState_MODEL_STATE_TERMINATE_FAILED ModelState = 6 + ModelState_MODEL_STATE_SCHEDULE_FAILED ModelState = 7 + ModelState_MODEL_STATE_SCALED_DOWN ModelState = 8 + ModelState_MODEL_STATE_CREATE ModelState = 9 + ModelState_MODEL_STATE_TERMINATE ModelState = 10 +) + +// Enum value maps for ModelState. +var ( + ModelState_name = map[int32]string{ + 0: "MODEL_STATE_UNKNOWN", + 1: "MODEL_STATE_PROGRESSING", + 2: "MODEL_STATE_AVAILABLE", + 3: "MODEL_STATE_FAILED", + 4: "MODEL_STATE_TERMINATING", + 5: "MODEL_STATE_TERMINATED", + 6: "MODEL_STATE_TERMINATE_FAILED", + 7: "MODEL_STATE_SCHEDULE_FAILED", + 8: "MODEL_STATE_SCALED_DOWN", + 9: "MODEL_STATE_CREATE", + 10: "MODEL_STATE_TERMINATE", + } + ModelState_value = map[string]int32{ + "MODEL_STATE_UNKNOWN": 0, + "MODEL_STATE_PROGRESSING": 1, + "MODEL_STATE_AVAILABLE": 2, + "MODEL_STATE_FAILED": 3, + "MODEL_STATE_TERMINATING": 4, + "MODEL_STATE_TERMINATED": 5, + "MODEL_STATE_TERMINATE_FAILED": 6, + "MODEL_STATE_SCHEDULE_FAILED": 7, + "MODEL_STATE_SCALED_DOWN": 8, + "MODEL_STATE_CREATE": 9, + "MODEL_STATE_TERMINATE": 10, + } +) + +func (x ModelState) Enum() *ModelState { + p := new(ModelState) + *p = x + return p +} + +func (x ModelState) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (ModelState) Descriptor() protoreflect.EnumDescriptor { + return file_mlops_scheduler_db_db_proto_enumTypes[0].Descriptor() +} + +func (ModelState) Type() protoreflect.EnumType { + return &file_mlops_scheduler_db_db_proto_enumTypes[0] +} + +func (x ModelState) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use ModelState.Descriptor instead. +func (ModelState) EnumDescriptor() ([]byte, []int) { + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{0} +} + +// ModelReplicaState represents the state of a model replica +type ModelReplicaState int32 + +const ( + ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN ModelReplicaState = 0 + ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED ModelReplicaState = 1 + ModelReplicaState_MODEL_REPLICA_STATE_LOADING ModelReplicaState = 2 + ModelReplicaState_MODEL_REPLICA_STATE_LOADED ModelReplicaState = 3 + ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED ModelReplicaState = 4 + ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED ModelReplicaState = 5 + ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED ModelReplicaState = 6 + ModelReplicaState_MODEL_REPLICA_STATE_UNLOADING ModelReplicaState = 7 + ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED ModelReplicaState = 8 + ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_FAILED ModelReplicaState = 9 + ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE ModelReplicaState = 10 + ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE ModelReplicaState = 11 + ModelReplicaState_MODEL_REPLICA_STATE_DRAINING ModelReplicaState = 12 +) + +// Enum value maps for ModelReplicaState. +var ( + ModelReplicaState_name = map[int32]string{ + 0: "MODEL_REPLICA_STATE_UNKNOWN", + 1: "MODEL_REPLICA_STATE_LOAD_REQUESTED", + 2: "MODEL_REPLICA_STATE_LOADING", + 3: "MODEL_REPLICA_STATE_LOADED", + 4: "MODEL_REPLICA_STATE_LOAD_FAILED", + 5: "MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED", + 6: "MODEL_REPLICA_STATE_UNLOAD_REQUESTED", + 7: "MODEL_REPLICA_STATE_UNLOADING", + 8: "MODEL_REPLICA_STATE_UNLOADED", + 9: "MODEL_REPLICA_STATE_UNLOAD_FAILED", + 10: "MODEL_REPLICA_STATE_AVAILABLE", + 11: "MODEL_REPLICA_STATE_LOADED_UNAVAILABLE", + 12: "MODEL_REPLICA_STATE_DRAINING", + } + ModelReplicaState_value = map[string]int32{ + "MODEL_REPLICA_STATE_UNKNOWN": 0, + "MODEL_REPLICA_STATE_LOAD_REQUESTED": 1, + "MODEL_REPLICA_STATE_LOADING": 2, + "MODEL_REPLICA_STATE_LOADED": 3, + "MODEL_REPLICA_STATE_LOAD_FAILED": 4, + "MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED": 5, + "MODEL_REPLICA_STATE_UNLOAD_REQUESTED": 6, + "MODEL_REPLICA_STATE_UNLOADING": 7, + "MODEL_REPLICA_STATE_UNLOADED": 8, + "MODEL_REPLICA_STATE_UNLOAD_FAILED": 9, + "MODEL_REPLICA_STATE_AVAILABLE": 10, + "MODEL_REPLICA_STATE_LOADED_UNAVAILABLE": 11, + "MODEL_REPLICA_STATE_DRAINING": 12, + } +) + +func (x ModelReplicaState) Enum() *ModelReplicaState { + p := new(ModelReplicaState) + *p = x + return p +} + +func (x ModelReplicaState) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (ModelReplicaState) Descriptor() protoreflect.EnumDescriptor { + return file_mlops_scheduler_db_db_proto_enumTypes[1].Descriptor() +} + +func (ModelReplicaState) Type() protoreflect.EnumType { + return &file_mlops_scheduler_db_db_proto_enumTypes[1] +} + +func (x ModelReplicaState) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use ModelReplicaState.Descriptor instead. +func (ModelReplicaState) EnumDescriptor() ([]byte, []int) { + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{1} +} + +// ReplicaStatus represents the status of a replica +type ReplicaStatus struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + State ModelReplicaState `protobuf:"varint,1,opt,name=state,proto3,enum=seldon.mlops.scheduler.db.ModelReplicaState" json:"state,omitempty"` + Reason string `protobuf:"bytes,2,opt,name=reason,proto3" json:"reason,omitempty"` + Timestamp *timestamppb.Timestamp `protobuf:"bytes,3,opt,name=timestamp,proto3" json:"timestamp,omitempty"` +} + +func (x *ReplicaStatus) Reset() { + *x = ReplicaStatus{} + if protoimpl.UnsafeEnabled { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ReplicaStatus) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ReplicaStatus) ProtoMessage() {} + +func (x *ReplicaStatus) ProtoReflect() protoreflect.Message { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ReplicaStatus.ProtoReflect.Descriptor instead. +func (*ReplicaStatus) Descriptor() ([]byte, []int) { + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{0} +} + +func (x *ReplicaStatus) GetState() ModelReplicaState { + if x != nil { + return x.State + } + return ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN +} + +func (x *ReplicaStatus) GetReason() string { + if x != nil { + return x.Reason + } + return "" +} + +func (x *ReplicaStatus) GetTimestamp() *timestamppb.Timestamp { + if x != nil { + return x.Timestamp + } + return nil +} + +// ModelStatus represents the overall status of a model +type ModelStatus struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + State ModelState `protobuf:"varint,1,opt,name=state,proto3,enum=seldon.mlops.scheduler.db.ModelState" json:"state,omitempty"` + ModelGwState ModelState `protobuf:"varint,2,opt,name=model_gw_state,json=modelGwState,proto3,enum=seldon.mlops.scheduler.db.ModelState" json:"model_gw_state,omitempty"` + Reason string `protobuf:"bytes,3,opt,name=reason,proto3" json:"reason,omitempty"` + ModelGwReason string `protobuf:"bytes,4,opt,name=model_gw_reason,json=modelGwReason,proto3" json:"model_gw_reason,omitempty"` + AvailableReplicas uint32 `protobuf:"varint,5,opt,name=available_replicas,json=availableReplicas,proto3" json:"available_replicas,omitempty"` + UnavailableReplicas uint32 `protobuf:"varint,6,opt,name=unavailable_replicas,json=unavailableReplicas,proto3" json:"unavailable_replicas,omitempty"` + DrainingReplicas uint32 `protobuf:"varint,7,opt,name=draining_replicas,json=drainingReplicas,proto3" json:"draining_replicas,omitempty"` + Timestamp *timestamppb.Timestamp `protobuf:"bytes,8,opt,name=timestamp,proto3" json:"timestamp,omitempty"` +} + +func (x *ModelStatus) Reset() { + *x = ModelStatus{} + if protoimpl.UnsafeEnabled { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ModelStatus) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ModelStatus) ProtoMessage() {} + +func (x *ModelStatus) ProtoReflect() protoreflect.Message { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ModelStatus.ProtoReflect.Descriptor instead. +func (*ModelStatus) Descriptor() ([]byte, []int) { + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{1} +} + +func (x *ModelStatus) GetState() ModelState { + if x != nil { + return x.State + } + return ModelState_MODEL_STATE_UNKNOWN +} + +func (x *ModelStatus) GetModelGwState() ModelState { + if x != nil { + return x.ModelGwState + } + return ModelState_MODEL_STATE_UNKNOWN +} + +func (x *ModelStatus) GetReason() string { + if x != nil { + return x.Reason + } + return "" +} + +func (x *ModelStatus) GetModelGwReason() string { + if x != nil { + return x.ModelGwReason + } + return "" +} + +func (x *ModelStatus) GetAvailableReplicas() uint32 { + if x != nil { + return x.AvailableReplicas + } + return 0 +} + +func (x *ModelStatus) GetUnavailableReplicas() uint32 { + if x != nil { + return x.UnavailableReplicas + } + return 0 +} + +func (x *ModelStatus) GetDrainingReplicas() uint32 { + if x != nil { + return x.DrainingReplicas + } + return 0 +} + +func (x *ModelStatus) GetTimestamp() *timestamppb.Timestamp { + if x != nil { + return x.Timestamp + } + return nil +} + +// ReplicaStatusEntry for map representation +type ReplicaStatusEntry struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + ReplicaIndex int32 `protobuf:"varint,1,opt,name=replica_index,json=replicaIndex,proto3" json:"replica_index,omitempty"` + Status *ReplicaStatus `protobuf:"bytes,2,opt,name=status,proto3" json:"status,omitempty"` +} + +func (x *ReplicaStatusEntry) Reset() { + *x = ReplicaStatusEntry{} + if protoimpl.UnsafeEnabled { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ReplicaStatusEntry) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ReplicaStatusEntry) ProtoMessage() {} + +func (x *ReplicaStatusEntry) ProtoReflect() protoreflect.Message { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[2] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ReplicaStatusEntry.ProtoReflect.Descriptor instead. +func (*ReplicaStatusEntry) Descriptor() ([]byte, []int) { + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{2} +} + +func (x *ReplicaStatusEntry) GetReplicaIndex() int32 { + if x != nil { + return x.ReplicaIndex + } + return 0 +} + +func (x *ReplicaStatusEntry) GetStatus() *ReplicaStatus { + if x != nil { + return x.Status + } + return nil +} + +// ModelVersion represents a version of a model +type ModelVersion struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + ModelDefn *scheduler.Model `protobuf:"bytes,1,opt,name=model_defn,json=modelDefn,proto3" json:"model_defn,omitempty"` // Assuming pb.Model is defined elsewhere + Version uint32 `protobuf:"varint,2,opt,name=version,proto3" json:"version,omitempty"` + Server string `protobuf:"bytes,3,opt,name=server,proto3" json:"server,omitempty"` + // Using repeated messages since proto doesn't support map directly + Replicas []*ReplicaStatusEntry `protobuf:"bytes,4,rep,name=replicas,proto3" json:"replicas,omitempty"` + State *ModelStatus `protobuf:"bytes,5,opt,name=state,proto3" json:"state,omitempty"` +} + +func (x *ModelVersion) Reset() { + *x = ModelVersion{} + if protoimpl.UnsafeEnabled { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ModelVersion) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ModelVersion) ProtoMessage() {} + +func (x *ModelVersion) ProtoReflect() protoreflect.Message { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[3] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ModelVersion.ProtoReflect.Descriptor instead. +func (*ModelVersion) Descriptor() ([]byte, []int) { + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{3} +} + +func (x *ModelVersion) GetModelDefn() *scheduler.Model { + if x != nil { + return x.ModelDefn + } + return nil +} + +func (x *ModelVersion) GetVersion() uint32 { + if x != nil { + return x.Version + } + return 0 +} + +func (x *ModelVersion) GetServer() string { + if x != nil { + return x.Server + } + return "" +} + +func (x *ModelVersion) GetReplicas() []*ReplicaStatusEntry { + if x != nil { + return x.Replicas + } + return nil +} + +func (x *ModelVersion) GetState() *ModelStatus { + if x != nil { + return x.State + } + return nil +} + +// ModelVersionID uniquely identifies a model version +type ModelVersionID struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + Version uint32 `protobuf:"varint,2,opt,name=version,proto3" json:"version,omitempty"` +} + +func (x *ModelVersionID) Reset() { + *x = ModelVersionID{} + if protoimpl.UnsafeEnabled { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ModelVersionID) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ModelVersionID) ProtoMessage() {} + +func (x *ModelVersionID) ProtoReflect() protoreflect.Message { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[4] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ModelVersionID.ProtoReflect.Descriptor instead. +func (*ModelVersionID) Descriptor() ([]byte, []int) { + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{4} +} + +func (x *ModelVersionID) GetName() string { + if x != nil { + return x.Name + } + return "" +} + +func (x *ModelVersionID) GetVersion() uint32 { + if x != nil { + return x.Version + } + return 0 +} + +// Model represents a model with its versions +type Model struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + Versions []*ModelVersion `protobuf:"bytes,2,rep,name=versions,proto3" json:"versions,omitempty"` + Deleted bool `protobuf:"varint,3,opt,name=deleted,proto3" json:"deleted,omitempty"` +} + +func (x *Model) Reset() { + *x = Model{} + if protoimpl.UnsafeEnabled { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[5] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *Model) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*Model) ProtoMessage() {} + +func (x *Model) ProtoReflect() protoreflect.Message { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[5] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use Model.ProtoReflect.Descriptor instead. +func (*Model) Descriptor() ([]byte, []int) { + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{5} +} + +func (x *Model) GetName() string { + if x != nil { + return x.Name + } + return "" +} + +func (x *Model) GetVersions() []*ModelVersion { + if x != nil { + return x.Versions + } + return nil +} + +func (x *Model) GetDeleted() bool { + if x != nil { + return x.Deleted + } + return false +} + +// Server represents a server with its configuration and replicas +type Server struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + Replicas map[int32]*ServerReplica `protobuf:"bytes,2,rep,name=replicas,proto3" json:"replicas,omitempty" protobuf_key:"varint,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + Shared bool `protobuf:"varint,3,opt,name=shared,proto3" json:"shared,omitempty"` + ExpectedReplicas int32 `protobuf:"varint,4,opt,name=expected_replicas,json=expectedReplicas,proto3" json:"expected_replicas,omitempty"` + MinReplicas int32 `protobuf:"varint,5,opt,name=min_replicas,json=minReplicas,proto3" json:"min_replicas,omitempty"` + MaxReplicas int32 `protobuf:"varint,6,opt,name=max_replicas,json=maxReplicas,proto3" json:"max_replicas,omitempty"` + KubernetesMeta *scheduler.KubernetesMeta `protobuf:"bytes,7,opt,name=kubernetes_meta,json=kubernetesMeta,proto3" json:"kubernetes_meta,omitempty"` +} + +func (x *Server) Reset() { + *x = Server{} + if protoimpl.UnsafeEnabled { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[6] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *Server) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*Server) ProtoMessage() {} + +func (x *Server) ProtoReflect() protoreflect.Message { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[6] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use Server.ProtoReflect.Descriptor instead. +func (*Server) Descriptor() ([]byte, []int) { + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{6} +} + +func (x *Server) GetName() string { + if x != nil { + return x.Name + } + return "" +} + +func (x *Server) GetReplicas() map[int32]*ServerReplica { + if x != nil { + return x.Replicas + } + return nil +} + +func (x *Server) GetShared() bool { + if x != nil { + return x.Shared + } + return false +} + +func (x *Server) GetExpectedReplicas() int32 { + if x != nil { + return x.ExpectedReplicas + } + return 0 +} + +func (x *Server) GetMinReplicas() int32 { + if x != nil { + return x.MinReplicas + } + return 0 +} + +func (x *Server) GetMaxReplicas() int32 { + if x != nil { + return x.MaxReplicas + } + return 0 +} + +func (x *Server) GetKubernetesMeta() *scheduler.KubernetesMeta { + if x != nil { + return x.KubernetesMeta + } + return nil +} + +// ModelLoadStatus represents a model's load status (for maps that use ModelVersionID as key) +type ModelLoadStatus struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + ModelVersionId *ModelVersionID `protobuf:"bytes,1,opt,name=model_version_id,json=modelVersionId,proto3" json:"model_version_id,omitempty"` + Loaded bool `protobuf:"varint,2,opt,name=loaded,proto3" json:"loaded,omitempty"` +} + +func (x *ModelLoadStatus) Reset() { + *x = ModelLoadStatus{} + if protoimpl.UnsafeEnabled { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[7] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ModelLoadStatus) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ModelLoadStatus) ProtoMessage() {} + +func (x *ModelLoadStatus) ProtoReflect() protoreflect.Message { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[7] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ModelLoadStatus.ProtoReflect.Descriptor instead. +func (*ModelLoadStatus) Descriptor() ([]byte, []int) { + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{7} +} + +func (x *ModelLoadStatus) GetModelVersionId() *ModelVersionID { + if x != nil { + return x.ModelVersionId + } + return nil +} + +func (x *ModelLoadStatus) GetLoaded() bool { + if x != nil { + return x.Loaded + } + return false +} + +// ServerReplica represents a single replica of a server +type ServerReplica struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + InferenceSvc string `protobuf:"bytes,1,opt,name=inference_svc,json=inferenceSvc,proto3" json:"inference_svc,omitempty"` + InferenceHttpPort int32 `protobuf:"varint,2,opt,name=inference_http_port,json=inferenceHttpPort,proto3" json:"inference_http_port,omitempty"` + InferenceGrpcPort int32 `protobuf:"varint,3,opt,name=inference_grpc_port,json=inferenceGrpcPort,proto3" json:"inference_grpc_port,omitempty"` + ServerName string `protobuf:"bytes,4,opt,name=server_name,json=serverName,proto3" json:"server_name,omitempty"` + ReplicaIdx int32 `protobuf:"varint,5,opt,name=replica_idx,json=replicaIdx,proto3" json:"replica_idx,omitempty"` + Capabilities []string `protobuf:"bytes,6,rep,name=capabilities,proto3" json:"capabilities,omitempty"` + Memory uint64 `protobuf:"varint,7,opt,name=memory,proto3" json:"memory,omitempty"` + AvailableMemory uint64 `protobuf:"varint,8,opt,name=available_memory,json=availableMemory,proto3" json:"available_memory,omitempty"` + // Using repeated messages since proto doesn't support complex map keys + LoadedModels []*ModelLoadStatus `protobuf:"bytes,9,rep,name=loaded_models,json=loadedModels,proto3" json:"loaded_models,omitempty"` + LoadingModels []*ModelLoadStatus `protobuf:"bytes,10,rep,name=loading_models,json=loadingModels,proto3" json:"loading_models,omitempty"` + OverCommitPercentage uint32 `protobuf:"varint,11,opt,name=over_commit_percentage,json=overCommitPercentage,proto3" json:"over_commit_percentage,omitempty"` + ReservedMemory uint64 `protobuf:"varint,12,opt,name=reserved_memory,json=reservedMemory,proto3" json:"reserved_memory,omitempty"` + UniqueLoadedModels map[string]bool `protobuf:"bytes,13,rep,name=unique_loaded_models,json=uniqueLoadedModels,proto3" json:"unique_loaded_models,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"varint,2,opt,name=value,proto3"` + IsDraining bool `protobuf:"varint,14,opt,name=is_draining,json=isDraining,proto3" json:"is_draining,omitempty"` +} + +func (x *ServerReplica) Reset() { + *x = ServerReplica{} + if protoimpl.UnsafeEnabled { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[8] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ServerReplica) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ServerReplica) ProtoMessage() {} + +func (x *ServerReplica) ProtoReflect() protoreflect.Message { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[8] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ServerReplica.ProtoReflect.Descriptor instead. +func (*ServerReplica) Descriptor() ([]byte, []int) { + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{8} +} + +func (x *ServerReplica) GetInferenceSvc() string { + if x != nil { + return x.InferenceSvc + } + return "" +} + +func (x *ServerReplica) GetInferenceHttpPort() int32 { + if x != nil { + return x.InferenceHttpPort + } + return 0 +} + +func (x *ServerReplica) GetInferenceGrpcPort() int32 { + if x != nil { + return x.InferenceGrpcPort + } + return 0 +} + +func (x *ServerReplica) GetServerName() string { + if x != nil { + return x.ServerName + } + return "" +} + +func (x *ServerReplica) GetReplicaIdx() int32 { + if x != nil { + return x.ReplicaIdx + } + return 0 +} + +func (x *ServerReplica) GetCapabilities() []string { + if x != nil { + return x.Capabilities + } + return nil +} + +func (x *ServerReplica) GetMemory() uint64 { + if x != nil { + return x.Memory + } + return 0 +} + +func (x *ServerReplica) GetAvailableMemory() uint64 { + if x != nil { + return x.AvailableMemory + } + return 0 +} + +func (x *ServerReplica) GetLoadedModels() []*ModelLoadStatus { + if x != nil { + return x.LoadedModels + } + return nil +} + +func (x *ServerReplica) GetLoadingModels() []*ModelLoadStatus { + if x != nil { + return x.LoadingModels + } + return nil +} + +func (x *ServerReplica) GetOverCommitPercentage() uint32 { + if x != nil { + return x.OverCommitPercentage + } + return 0 +} + +func (x *ServerReplica) GetReservedMemory() uint64 { + if x != nil { + return x.ReservedMemory + } + return 0 +} + +func (x *ServerReplica) GetUniqueLoadedModels() map[string]bool { + if x != nil { + return x.UniqueLoadedModels + } + return nil +} + +func (x *ServerReplica) GetIsDraining() bool { + if x != nil { + return x.IsDraining + } + return false +} + +var File_mlops_scheduler_db_db_proto protoreflect.FileDescriptor + +var file_mlops_scheduler_db_db_proto_rawDesc = []byte{ + 0x0a, 0x1b, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2f, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, + 0x72, 0x2f, 0x64, 0x62, 0x2f, 0x64, 0x62, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x19, 0x73, + 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, + 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x1a, 0x1f, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, + 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2f, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, + 0x61, 0x6d, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x1a, 0x1f, 0x6d, 0x6c, 0x6f, 0x70, 0x73, + 0x2f, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2f, 0x73, 0x63, 0x68, 0x65, 0x64, + 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0xa5, 0x01, 0x0a, 0x0d, 0x52, + 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x42, 0x0a, 0x05, + 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x2c, 0x2e, 0x73, 0x65, + 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, + 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x52, 0x65, 0x70, + 0x6c, 0x69, 0x63, 0x61, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, + 0x12, 0x16, 0x0a, 0x06, 0x72, 0x65, 0x61, 0x73, 0x6f, 0x6e, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x06, 0x72, 0x65, 0x61, 0x73, 0x6f, 0x6e, 0x12, 0x38, 0x0a, 0x09, 0x74, 0x69, 0x6d, 0x65, + 0x73, 0x74, 0x61, 0x6d, 0x70, 0x18, 0x03, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x6f, + 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x54, 0x69, + 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x52, 0x09, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, + 0x6d, 0x70, 0x22, 0xa0, 0x03, 0x0a, 0x0b, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x53, 0x74, 0x61, 0x74, + 0x75, 0x73, 0x12, 0x3b, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x0e, 0x32, 0x25, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, + 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x4d, 0x6f, + 0x64, 0x65, 0x6c, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x12, + 0x4b, 0x0a, 0x0e, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x5f, 0x67, 0x77, 0x5f, 0x73, 0x74, 0x61, 0x74, + 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x25, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, + 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, + 0x2e, 0x64, 0x62, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x53, 0x74, 0x61, 0x74, 0x65, 0x52, 0x0c, + 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x77, 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x16, 0x0a, 0x06, + 0x72, 0x65, 0x61, 0x73, 0x6f, 0x6e, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x72, 0x65, + 0x61, 0x73, 0x6f, 0x6e, 0x12, 0x26, 0x0a, 0x0f, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x5f, 0x67, 0x77, + 0x5f, 0x72, 0x65, 0x61, 0x73, 0x6f, 0x6e, 0x18, 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0d, 0x6d, + 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x77, 0x52, 0x65, 0x61, 0x73, 0x6f, 0x6e, 0x12, 0x2d, 0x0a, 0x12, + 0x61, 0x76, 0x61, 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, + 0x61, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x11, 0x61, 0x76, 0x61, 0x69, 0x6c, 0x61, + 0x62, 0x6c, 0x65, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x31, 0x0a, 0x14, 0x75, + 0x6e, 0x61, 0x76, 0x61, 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, + 0x63, 0x61, 0x73, 0x18, 0x06, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x13, 0x75, 0x6e, 0x61, 0x76, 0x61, + 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x2b, + 0x0a, 0x11, 0x64, 0x72, 0x61, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, + 0x63, 0x61, 0x73, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x10, 0x64, 0x72, 0x61, 0x69, 0x6e, + 0x69, 0x6e, 0x67, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x38, 0x0a, 0x09, 0x74, + 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x18, 0x08, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, + 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, + 0x2e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x52, 0x09, 0x74, 0x69, 0x6d, 0x65, + 0x73, 0x74, 0x61, 0x6d, 0x70, 0x22, 0x7b, 0x0a, 0x12, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, + 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x23, 0x0a, 0x0d, 0x72, + 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x5f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x18, 0x01, 0x20, 0x01, + 0x28, 0x05, 0x52, 0x0c, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x49, 0x6e, 0x64, 0x65, 0x78, + 0x12, 0x40, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, + 0x32, 0x28, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, + 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x52, 0x65, 0x70, + 0x6c, 0x69, 0x63, 0x61, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, + 0x75, 0x73, 0x22, 0x87, 0x02, 0x0a, 0x0c, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x56, 0x65, 0x72, 0x73, + 0x69, 0x6f, 0x6e, 0x12, 0x3c, 0x0a, 0x0a, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x5f, 0x64, 0x65, 0x66, + 0x6e, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1d, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, + 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, + 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x52, 0x09, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x44, 0x65, 0x66, + 0x6e, 0x12, 0x18, 0x0a, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x02, 0x20, 0x01, + 0x28, 0x0d, 0x52, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x12, 0x16, 0x0a, 0x06, 0x73, + 0x65, 0x72, 0x76, 0x65, 0x72, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x73, 0x65, 0x72, + 0x76, 0x65, 0x72, 0x12, 0x49, 0x0a, 0x08, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x18, + 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2d, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, + 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, + 0x62, 0x2e, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x45, + 0x6e, 0x74, 0x72, 0x79, 0x52, 0x08, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x3c, + 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x26, 0x2e, + 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, + 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x53, + 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x0a, 0x0e, + 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x12, 0x12, + 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, + 0x6d, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x02, 0x20, + 0x01, 0x28, 0x0d, 0x52, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x22, 0x7a, 0x0a, 0x05, + 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, + 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x43, 0x0a, 0x08, 0x76, 0x65, 0x72, + 0x73, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x73, 0x65, + 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, + 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x56, 0x65, 0x72, + 0x73, 0x69, 0x6f, 0x6e, 0x52, 0x08, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x18, + 0x0a, 0x07, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x52, + 0x07, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x22, 0xac, 0x03, 0x0a, 0x06, 0x53, 0x65, 0x72, + 0x76, 0x65, 0x72, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, + 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x4b, 0x0a, 0x08, 0x72, 0x65, 0x70, 0x6c, 0x69, + 0x63, 0x61, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2f, 0x2e, 0x73, 0x65, 0x6c, 0x64, + 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, + 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x52, 0x65, 0x70, + 0x6c, 0x69, 0x63, 0x61, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x08, 0x72, 0x65, 0x70, 0x6c, + 0x69, 0x63, 0x61, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x18, 0x03, + 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x12, 0x2b, 0x0a, 0x11, + 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, + 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x10, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, + 0x64, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x21, 0x0a, 0x0c, 0x6d, 0x69, 0x6e, + 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x05, 0x52, + 0x0b, 0x6d, 0x69, 0x6e, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x21, 0x0a, 0x0c, + 0x6d, 0x61, 0x78, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x18, 0x06, 0x20, 0x01, + 0x28, 0x05, 0x52, 0x0b, 0x6d, 0x61, 0x78, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, + 0x4f, 0x0a, 0x0f, 0x6b, 0x75, 0x62, 0x65, 0x72, 0x6e, 0x65, 0x74, 0x65, 0x73, 0x5f, 0x6d, 0x65, + 0x74, 0x61, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x26, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, + 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, + 0x72, 0x2e, 0x4b, 0x75, 0x62, 0x65, 0x72, 0x6e, 0x65, 0x74, 0x65, 0x73, 0x4d, 0x65, 0x74, 0x61, + 0x52, 0x0e, 0x6b, 0x75, 0x62, 0x65, 0x72, 0x6e, 0x65, 0x74, 0x65, 0x73, 0x4d, 0x65, 0x74, 0x61, + 0x1a, 0x65, 0x0a, 0x0d, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x45, 0x6e, 0x74, 0x72, + 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x03, + 0x6b, 0x65, 0x79, 0x12, 0x3e, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, + 0x28, 0x0b, 0x32, 0x28, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, + 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x53, + 0x65, 0x72, 0x76, 0x65, 0x72, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x52, 0x05, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x7e, 0x0a, 0x0f, 0x4d, 0x6f, 0x64, 0x65, 0x6c, + 0x4c, 0x6f, 0x61, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x53, 0x0a, 0x10, 0x6d, 0x6f, + 0x64, 0x65, 0x6c, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x5f, 0x69, 0x64, 0x18, 0x01, + 0x20, 0x01, 0x28, 0x0b, 0x32, 0x29, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, + 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, + 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x52, + 0x0e, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x49, 0x64, 0x12, + 0x16, 0x0a, 0x06, 0x6c, 0x6f, 0x61, 0x64, 0x65, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, + 0x06, 0x6c, 0x6f, 0x61, 0x64, 0x65, 0x64, 0x22, 0x9c, 0x06, 0x0a, 0x0d, 0x53, 0x65, 0x72, 0x76, + 0x65, 0x72, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x12, 0x23, 0x0a, 0x0d, 0x69, 0x6e, 0x66, + 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, 0x76, 0x63, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, + 0x52, 0x0c, 0x69, 0x6e, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x53, 0x76, 0x63, 0x12, 0x2e, + 0x0a, 0x13, 0x69, 0x6e, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x68, 0x74, 0x74, 0x70, + 0x5f, 0x70, 0x6f, 0x72, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x11, 0x69, 0x6e, 0x66, + 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x48, 0x74, 0x74, 0x70, 0x50, 0x6f, 0x72, 0x74, 0x12, 0x2e, + 0x0a, 0x13, 0x69, 0x6e, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x67, 0x72, 0x70, 0x63, + 0x5f, 0x70, 0x6f, 0x72, 0x74, 0x18, 0x03, 0x20, 0x01, 0x28, 0x05, 0x52, 0x11, 0x69, 0x6e, 0x66, + 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x47, 0x72, 0x70, 0x63, 0x50, 0x6f, 0x72, 0x74, 0x12, 0x1f, + 0x0a, 0x0b, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x04, 0x20, + 0x01, 0x28, 0x09, 0x52, 0x0a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x4e, 0x61, 0x6d, 0x65, 0x12, + 0x1f, 0x0a, 0x0b, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x5f, 0x69, 0x64, 0x78, 0x18, 0x05, + 0x20, 0x01, 0x28, 0x05, 0x52, 0x0a, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x49, 0x64, 0x78, + 0x12, 0x22, 0x0a, 0x0c, 0x63, 0x61, 0x70, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, + 0x18, 0x06, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0c, 0x63, 0x61, 0x70, 0x61, 0x62, 0x69, 0x6c, 0x69, + 0x74, 0x69, 0x65, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x18, 0x07, + 0x20, 0x01, 0x28, 0x04, 0x52, 0x06, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x12, 0x29, 0x0a, 0x10, + 0x61, 0x76, 0x61, 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, + 0x18, 0x08, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0f, 0x61, 0x76, 0x61, 0x69, 0x6c, 0x61, 0x62, 0x6c, + 0x65, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x12, 0x4f, 0x0a, 0x0d, 0x6c, 0x6f, 0x61, 0x64, 0x65, + 0x64, 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x73, 0x18, 0x09, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2a, + 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, + 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, + 0x4c, 0x6f, 0x61, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x0c, 0x6c, 0x6f, 0x61, 0x64, + 0x65, 0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x73, 0x12, 0x51, 0x0a, 0x0e, 0x6c, 0x6f, 0x61, 0x64, + 0x69, 0x6e, 0x67, 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x73, 0x18, 0x0a, 0x20, 0x03, 0x28, 0x0b, + 0x32, 0x2a, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, + 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x4d, 0x6f, 0x64, + 0x65, 0x6c, 0x4c, 0x6f, 0x61, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x0d, 0x6c, 0x6f, + 0x61, 0x64, 0x69, 0x6e, 0x67, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x73, 0x12, 0x34, 0x0a, 0x16, 0x6f, + 0x76, 0x65, 0x72, 0x5f, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x5f, 0x70, 0x65, 0x72, 0x63, 0x65, + 0x6e, 0x74, 0x61, 0x67, 0x65, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x14, 0x6f, 0x76, 0x65, + 0x72, 0x43, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x50, 0x65, 0x72, 0x63, 0x65, 0x6e, 0x74, 0x61, 0x67, + 0x65, 0x12, 0x27, 0x0a, 0x0f, 0x72, 0x65, 0x73, 0x65, 0x72, 0x76, 0x65, 0x64, 0x5f, 0x6d, 0x65, + 0x6d, 0x6f, 0x72, 0x79, 0x18, 0x0c, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0e, 0x72, 0x65, 0x73, 0x65, + 0x72, 0x76, 0x65, 0x64, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x12, 0x72, 0x0a, 0x14, 0x75, 0x6e, + 0x69, 0x71, 0x75, 0x65, 0x5f, 0x6c, 0x6f, 0x61, 0x64, 0x65, 0x64, 0x5f, 0x6d, 0x6f, 0x64, 0x65, + 0x6c, 0x73, 0x18, 0x0d, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x40, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, + 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, + 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x52, 0x65, 0x70, 0x6c, 0x69, + 0x63, 0x61, 0x2e, 0x55, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x4c, 0x6f, 0x61, 0x64, 0x65, 0x64, 0x4d, + 0x6f, 0x64, 0x65, 0x6c, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x12, 0x75, 0x6e, 0x69, 0x71, + 0x75, 0x65, 0x4c, 0x6f, 0x61, 0x64, 0x65, 0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x73, 0x12, 0x1f, + 0x0a, 0x0b, 0x69, 0x73, 0x5f, 0x64, 0x72, 0x61, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x18, 0x0e, 0x20, + 0x01, 0x28, 0x08, 0x52, 0x0a, 0x69, 0x73, 0x44, 0x72, 0x61, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x1a, + 0x45, 0x0a, 0x17, 0x55, 0x6e, 0x69, 0x71, 0x75, 0x65, 0x4c, 0x6f, 0x61, 0x64, 0x65, 0x64, 0x4d, + 0x6f, 0x64, 0x65, 0x6c, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, + 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, 0x05, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x05, 0x76, 0x61, 0x6c, + 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x2a, 0xc1, 0x02, 0x0a, 0x0a, 0x4d, 0x6f, 0x64, 0x65, 0x6c, + 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x17, 0x0a, 0x13, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x53, + 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x1b, + 0x0a, 0x17, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x50, 0x52, + 0x4f, 0x47, 0x52, 0x45, 0x53, 0x53, 0x49, 0x4e, 0x47, 0x10, 0x01, 0x12, 0x19, 0x0a, 0x15, 0x4d, + 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x41, 0x56, 0x41, 0x49, 0x4c, + 0x41, 0x42, 0x4c, 0x45, 0x10, 0x02, 0x12, 0x16, 0x0a, 0x12, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, + 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x46, 0x41, 0x49, 0x4c, 0x45, 0x44, 0x10, 0x03, 0x12, 0x1b, + 0x0a, 0x17, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x54, 0x45, + 0x52, 0x4d, 0x49, 0x4e, 0x41, 0x54, 0x49, 0x4e, 0x47, 0x10, 0x04, 0x12, 0x1a, 0x0a, 0x16, 0x4d, + 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x54, 0x45, 0x52, 0x4d, 0x49, + 0x4e, 0x41, 0x54, 0x45, 0x44, 0x10, 0x05, 0x12, 0x20, 0x0a, 0x1c, 0x4d, 0x4f, 0x44, 0x45, 0x4c, + 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x54, 0x45, 0x52, 0x4d, 0x49, 0x4e, 0x41, 0x54, 0x45, + 0x5f, 0x46, 0x41, 0x49, 0x4c, 0x45, 0x44, 0x10, 0x06, 0x12, 0x1f, 0x0a, 0x1b, 0x4d, 0x4f, 0x44, + 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x53, 0x43, 0x48, 0x45, 0x44, 0x55, 0x4c, + 0x45, 0x5f, 0x46, 0x41, 0x49, 0x4c, 0x45, 0x44, 0x10, 0x07, 0x12, 0x1b, 0x0a, 0x17, 0x4d, 0x4f, + 0x44, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x53, 0x43, 0x41, 0x4c, 0x45, 0x44, + 0x5f, 0x44, 0x4f, 0x57, 0x4e, 0x10, 0x08, 0x12, 0x16, 0x0a, 0x12, 0x4d, 0x4f, 0x44, 0x45, 0x4c, + 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x43, 0x52, 0x45, 0x41, 0x54, 0x45, 0x10, 0x09, 0x12, + 0x19, 0x0a, 0x15, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x54, + 0x45, 0x52, 0x4d, 0x49, 0x4e, 0x41, 0x54, 0x45, 0x10, 0x0a, 0x2a, 0xf9, 0x03, 0x0a, 0x11, 0x4d, + 0x6f, 0x64, 0x65, 0x6c, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x53, 0x74, 0x61, 0x74, 0x65, + 0x12, 0x1f, 0x0a, 0x1b, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, + 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, + 0x00, 0x12, 0x26, 0x0a, 0x22, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, + 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x4c, 0x4f, 0x41, 0x44, 0x5f, 0x52, 0x45, + 0x51, 0x55, 0x45, 0x53, 0x54, 0x45, 0x44, 0x10, 0x01, 0x12, 0x1f, 0x0a, 0x1b, 0x4d, 0x4f, 0x44, + 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, + 0x5f, 0x4c, 0x4f, 0x41, 0x44, 0x49, 0x4e, 0x47, 0x10, 0x02, 0x12, 0x1e, 0x0a, 0x1a, 0x4d, 0x4f, + 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, + 0x45, 0x5f, 0x4c, 0x4f, 0x41, 0x44, 0x45, 0x44, 0x10, 0x03, 0x12, 0x23, 0x0a, 0x1f, 0x4d, 0x4f, + 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, + 0x45, 0x5f, 0x4c, 0x4f, 0x41, 0x44, 0x5f, 0x46, 0x41, 0x49, 0x4c, 0x45, 0x44, 0x10, 0x04, 0x12, + 0x2e, 0x0a, 0x2a, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, + 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4c, 0x4f, 0x41, 0x44, 0x5f, 0x45, 0x4e, + 0x56, 0x4f, 0x59, 0x5f, 0x52, 0x45, 0x51, 0x55, 0x45, 0x53, 0x54, 0x45, 0x44, 0x10, 0x05, 0x12, + 0x28, 0x0a, 0x24, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, + 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4c, 0x4f, 0x41, 0x44, 0x5f, 0x52, 0x45, + 0x51, 0x55, 0x45, 0x53, 0x54, 0x45, 0x44, 0x10, 0x06, 0x12, 0x21, 0x0a, 0x1d, 0x4d, 0x4f, 0x44, + 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, + 0x5f, 0x55, 0x4e, 0x4c, 0x4f, 0x41, 0x44, 0x49, 0x4e, 0x47, 0x10, 0x07, 0x12, 0x20, 0x0a, 0x1c, + 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, + 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4c, 0x4f, 0x41, 0x44, 0x45, 0x44, 0x10, 0x08, 0x12, 0x25, + 0x0a, 0x21, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, + 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4c, 0x4f, 0x41, 0x44, 0x5f, 0x46, 0x41, 0x49, + 0x4c, 0x45, 0x44, 0x10, 0x09, 0x12, 0x21, 0x0a, 0x1d, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, + 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x41, 0x56, 0x41, + 0x49, 0x4c, 0x41, 0x42, 0x4c, 0x45, 0x10, 0x0a, 0x12, 0x2a, 0x0a, 0x26, 0x4d, 0x4f, 0x44, 0x45, + 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, + 0x4c, 0x4f, 0x41, 0x44, 0x45, 0x44, 0x5f, 0x55, 0x4e, 0x41, 0x56, 0x41, 0x49, 0x4c, 0x41, 0x42, + 0x4c, 0x45, 0x10, 0x0b, 0x12, 0x20, 0x0a, 0x1c, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, + 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x44, 0x52, 0x41, 0x49, + 0x4e, 0x49, 0x4e, 0x47, 0x10, 0x0c, 0x42, 0x3f, 0x5a, 0x3d, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, + 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x69, 0x6f, 0x2f, 0x73, 0x65, + 0x6c, 0x64, 0x6f, 0x6e, 0x2d, 0x63, 0x6f, 0x72, 0x65, 0x2f, 0x61, 0x70, 0x69, 0x73, 0x2f, 0x67, + 0x6f, 0x2f, 0x76, 0x32, 0x2f, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2f, 0x73, 0x63, 0x68, 0x65, 0x64, + 0x75, 0x6c, 0x65, 0x72, 0x2f, 0x64, 0x62, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, +} + +var ( + file_mlops_scheduler_db_db_proto_rawDescOnce sync.Once + file_mlops_scheduler_db_db_proto_rawDescData = file_mlops_scheduler_db_db_proto_rawDesc +) + +func file_mlops_scheduler_db_db_proto_rawDescGZIP() []byte { + file_mlops_scheduler_db_db_proto_rawDescOnce.Do(func() { + file_mlops_scheduler_db_db_proto_rawDescData = protoimpl.X.CompressGZIP(file_mlops_scheduler_db_db_proto_rawDescData) + }) + return file_mlops_scheduler_db_db_proto_rawDescData +} + +var file_mlops_scheduler_db_db_proto_enumTypes = make([]protoimpl.EnumInfo, 2) +var file_mlops_scheduler_db_db_proto_msgTypes = make([]protoimpl.MessageInfo, 11) +var file_mlops_scheduler_db_db_proto_goTypes = []any{ + (ModelState)(0), // 0: seldon.mlops.scheduler.db.ModelState + (ModelReplicaState)(0), // 1: seldon.mlops.scheduler.db.ModelReplicaState + (*ReplicaStatus)(nil), // 2: seldon.mlops.scheduler.db.ReplicaStatus + (*ModelStatus)(nil), // 3: seldon.mlops.scheduler.db.ModelStatus + (*ReplicaStatusEntry)(nil), // 4: seldon.mlops.scheduler.db.ReplicaStatusEntry + (*ModelVersion)(nil), // 5: seldon.mlops.scheduler.db.ModelVersion + (*ModelVersionID)(nil), // 6: seldon.mlops.scheduler.db.ModelVersionID + (*Model)(nil), // 7: seldon.mlops.scheduler.db.Model + (*Server)(nil), // 8: seldon.mlops.scheduler.db.Server + (*ModelLoadStatus)(nil), // 9: seldon.mlops.scheduler.db.ModelLoadStatus + (*ServerReplica)(nil), // 10: seldon.mlops.scheduler.db.ServerReplica + nil, // 11: seldon.mlops.scheduler.db.Server.ReplicasEntry + nil, // 12: seldon.mlops.scheduler.db.ServerReplica.UniqueLoadedModelsEntry + (*timestamppb.Timestamp)(nil), // 13: google.protobuf.Timestamp + (*scheduler.Model)(nil), // 14: seldon.mlops.scheduler.Model + (*scheduler.KubernetesMeta)(nil), // 15: seldon.mlops.scheduler.KubernetesMeta +} +var file_mlops_scheduler_db_db_proto_depIdxs = []int32{ + 1, // 0: seldon.mlops.scheduler.db.ReplicaStatus.state:type_name -> seldon.mlops.scheduler.db.ModelReplicaState + 13, // 1: seldon.mlops.scheduler.db.ReplicaStatus.timestamp:type_name -> google.protobuf.Timestamp + 0, // 2: seldon.mlops.scheduler.db.ModelStatus.state:type_name -> seldon.mlops.scheduler.db.ModelState + 0, // 3: seldon.mlops.scheduler.db.ModelStatus.model_gw_state:type_name -> seldon.mlops.scheduler.db.ModelState + 13, // 4: seldon.mlops.scheduler.db.ModelStatus.timestamp:type_name -> google.protobuf.Timestamp + 2, // 5: seldon.mlops.scheduler.db.ReplicaStatusEntry.status:type_name -> seldon.mlops.scheduler.db.ReplicaStatus + 14, // 6: seldon.mlops.scheduler.db.ModelVersion.model_defn:type_name -> seldon.mlops.scheduler.Model + 4, // 7: seldon.mlops.scheduler.db.ModelVersion.replicas:type_name -> seldon.mlops.scheduler.db.ReplicaStatusEntry + 3, // 8: seldon.mlops.scheduler.db.ModelVersion.state:type_name -> seldon.mlops.scheduler.db.ModelStatus + 5, // 9: seldon.mlops.scheduler.db.Model.versions:type_name -> seldon.mlops.scheduler.db.ModelVersion + 11, // 10: seldon.mlops.scheduler.db.Server.replicas:type_name -> seldon.mlops.scheduler.db.Server.ReplicasEntry + 15, // 11: seldon.mlops.scheduler.db.Server.kubernetes_meta:type_name -> seldon.mlops.scheduler.KubernetesMeta + 6, // 12: seldon.mlops.scheduler.db.ModelLoadStatus.model_version_id:type_name -> seldon.mlops.scheduler.db.ModelVersionID + 9, // 13: seldon.mlops.scheduler.db.ServerReplica.loaded_models:type_name -> seldon.mlops.scheduler.db.ModelLoadStatus + 9, // 14: seldon.mlops.scheduler.db.ServerReplica.loading_models:type_name -> seldon.mlops.scheduler.db.ModelLoadStatus + 12, // 15: seldon.mlops.scheduler.db.ServerReplica.unique_loaded_models:type_name -> seldon.mlops.scheduler.db.ServerReplica.UniqueLoadedModelsEntry + 10, // 16: seldon.mlops.scheduler.db.Server.ReplicasEntry.value:type_name -> seldon.mlops.scheduler.db.ServerReplica + 17, // [17:17] is the sub-list for method output_type + 17, // [17:17] is the sub-list for method input_type + 17, // [17:17] is the sub-list for extension type_name + 17, // [17:17] is the sub-list for extension extendee + 0, // [0:17] is the sub-list for field type_name +} + +func init() { file_mlops_scheduler_db_db_proto_init() } +func file_mlops_scheduler_db_db_proto_init() { + if File_mlops_scheduler_db_db_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_mlops_scheduler_db_db_proto_msgTypes[0].Exporter = func(v any, i int) any { + switch v := v.(*ReplicaStatus); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_mlops_scheduler_db_db_proto_msgTypes[1].Exporter = func(v any, i int) any { + switch v := v.(*ModelStatus); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_mlops_scheduler_db_db_proto_msgTypes[2].Exporter = func(v any, i int) any { + switch v := v.(*ReplicaStatusEntry); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_mlops_scheduler_db_db_proto_msgTypes[3].Exporter = func(v any, i int) any { + switch v := v.(*ModelVersion); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_mlops_scheduler_db_db_proto_msgTypes[4].Exporter = func(v any, i int) any { + switch v := v.(*ModelVersionID); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_mlops_scheduler_db_db_proto_msgTypes[5].Exporter = func(v any, i int) any { + switch v := v.(*Model); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_mlops_scheduler_db_db_proto_msgTypes[6].Exporter = func(v any, i int) any { + switch v := v.(*Server); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_mlops_scheduler_db_db_proto_msgTypes[7].Exporter = func(v any, i int) any { + switch v := v.(*ModelLoadStatus); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_mlops_scheduler_db_db_proto_msgTypes[8].Exporter = func(v any, i int) any { + switch v := v.(*ServerReplica); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_mlops_scheduler_db_db_proto_rawDesc, + NumEnums: 2, + NumMessages: 11, + NumExtensions: 0, + NumServices: 0, + }, + GoTypes: file_mlops_scheduler_db_db_proto_goTypes, + DependencyIndexes: file_mlops_scheduler_db_db_proto_depIdxs, + EnumInfos: file_mlops_scheduler_db_db_proto_enumTypes, + MessageInfos: file_mlops_scheduler_db_db_proto_msgTypes, + }.Build() + File_mlops_scheduler_db_db_proto = out.File + file_mlops_scheduler_db_db_proto_rawDesc = nil + file_mlops_scheduler_db_db_proto_goTypes = nil + file_mlops_scheduler_db_db_proto_depIdxs = nil +} diff --git a/apis/mlops/scheduler/db/db.proto b/apis/mlops/scheduler/db/db.proto new file mode 100644 index 0000000000..2830bc414f --- /dev/null +++ b/apis/mlops/scheduler/db/db.proto @@ -0,0 +1,125 @@ +syntax = "proto3"; + +package seldon.mlops.scheduler.db; + +option go_package = "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db"; + +import "google/protobuf/timestamp.proto"; +import "mlops/scheduler/scheduler.proto"; + +// ModelState represents the state of a model +enum ModelState { + MODEL_STATE_UNKNOWN = 0; + MODEL_STATE_PROGRESSING = 1; + MODEL_STATE_AVAILABLE = 2; + MODEL_STATE_FAILED = 3; + MODEL_STATE_TERMINATING = 4; + MODEL_STATE_TERMINATED = 5; + MODEL_STATE_TERMINATE_FAILED = 6; + MODEL_STATE_SCHEDULE_FAILED = 7; + MODEL_STATE_SCALED_DOWN = 8; + MODEL_STATE_CREATE = 9; + MODEL_STATE_TERMINATE = 10; +} + +// ModelReplicaState represents the state of a model replica +enum ModelReplicaState { + MODEL_REPLICA_STATE_UNKNOWN = 0; + MODEL_REPLICA_STATE_LOAD_REQUESTED = 1; + MODEL_REPLICA_STATE_LOADING = 2; + MODEL_REPLICA_STATE_LOADED = 3; + MODEL_REPLICA_STATE_LOAD_FAILED = 4; + MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED = 5; + MODEL_REPLICA_STATE_UNLOAD_REQUESTED = 6; + MODEL_REPLICA_STATE_UNLOADING = 7; + MODEL_REPLICA_STATE_UNLOADED = 8; + MODEL_REPLICA_STATE_UNLOAD_FAILED = 9; + MODEL_REPLICA_STATE_AVAILABLE = 10; + MODEL_REPLICA_STATE_LOADED_UNAVAILABLE = 11; + MODEL_REPLICA_STATE_DRAINING = 12; +} + +// ReplicaStatus represents the status of a replica +message ReplicaStatus { + ModelReplicaState state = 1; + string reason = 2; + google.protobuf.Timestamp timestamp = 3; +} + +// ModelStatus represents the overall status of a model +message ModelStatus { + ModelState state = 1; + ModelState model_gw_state = 2; + string reason = 3; + string model_gw_reason = 4; + uint32 available_replicas = 5; + uint32 unavailable_replicas = 6; + uint32 draining_replicas = 7; + google.protobuf.Timestamp timestamp = 8; +} + +// ReplicaStatusEntry for map representation +message ReplicaStatusEntry { + int32 replica_index = 1; + ReplicaStatus status = 2; +} + +// ModelVersion represents a version of a model +message ModelVersion { + seldon.mlops.scheduler.Model model_defn = 1; // Assuming pb.Model is defined elsewhere + uint32 version = 2; + string server = 3; + // Using repeated messages since proto doesn't support map directly + repeated ReplicaStatusEntry replicas = 4; + ModelStatus state = 5; +} + + +// ModelVersionID uniquely identifies a model version +message ModelVersionID { + string name = 1; + uint32 version = 2; +} + +// Model represents a model with its versions +message Model { + string name = 1; + repeated ModelVersion versions = 2; + bool deleted = 3; +} + +// Server represents a server with its configuration and replicas +message Server { + string name = 1; + map replicas = 2; + bool shared = 3; + int32 expected_replicas = 4; + int32 min_replicas = 5; + int32 max_replicas = 6; + seldon.mlops.scheduler.KubernetesMeta kubernetes_meta = 7; +} + +// ModelLoadStatus represents a model's load status (for maps that use ModelVersionID as key) +message ModelLoadStatus { + ModelVersionID model_version_id = 1; + bool loaded = 2; +} + +// ServerReplica represents a single replica of a server +message ServerReplica { + string inference_svc = 1; + int32 inference_http_port = 2; + int32 inference_grpc_port = 3; + string server_name = 4; + int32 replica_idx = 5; + repeated string capabilities = 6; + uint64 memory = 7; + uint64 available_memory = 8; + // Using repeated messages since proto doesn't support complex map keys + repeated ModelLoadStatus loaded_models = 9; + repeated ModelLoadStatus loading_models = 10; + uint32 over_commit_percentage = 11; + uint64 reserved_memory = 12; + map unique_loaded_models = 13; + bool is_draining = 14; +} diff --git a/apis/mlops/scheduler/storage.proto b/apis/mlops/scheduler/storage.proto index f262a4c055..69784b3c1c 100644 --- a/apis/mlops/scheduler/storage.proto +++ b/apis/mlops/scheduler/storage.proto @@ -20,3 +20,4 @@ message ExperimentSnapshot { // on restart, which would guard against lost events in communication. bool deleted = 2; } + diff --git a/scheduler/pkg/bridge/event-data-store-bridge.go b/scheduler/pkg/bridge/event-data-store-bridge.go deleted file mode 100644 index df462a561f..0000000000 --- a/scheduler/pkg/bridge/event-data-store-bridge.go +++ /dev/null @@ -1,7 +0,0 @@ -package bridge - -import "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" - -type EventDataStoreBridge interface { - AddServerReplica(models []coordinator.ModelEventMsg, msg coordinator.ServerEventMsg) error -} diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index 4a6b26823a..e6a1071795 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -15,7 +15,6 @@ import ( "sort" "sync" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/bridge" log "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" @@ -31,9 +30,10 @@ type MemoryStore struct { store Storage logger log.FieldLogger eventHub *coordinator.EventHub - bridge bridge.EventDataStoreBridge } +var _ ModelStore = &MemoryStore{} + func NewMemoryStore( logger log.FieldLogger, store Storage, diff --git a/scheduler/pkg/store/mesh.go b/scheduler/pkg/store/mesh.go index ac0360a5c3..524400bc8d 100644 --- a/scheduler/pkg/store/mesh.go +++ b/scheduler/pkg/store/mesh.go @@ -172,7 +172,6 @@ type ServerReplica struct { inferenceGrpcPort int32 serverName string replicaIdx int - server *Server capabilities []string memory uint64 availableMemory uint64 @@ -206,7 +205,6 @@ func NewServerReplica(inferenceSvc string, inferenceGrpcPort: inferenceGrpcPort, serverName: server.name, replicaIdx: replicaIdx, - server: server, capabilities: cleanCapabilities(capabilities), memory: memory, availableMemory: availableMemory, @@ -226,7 +224,6 @@ func NewServerReplicaFromConfig(server *Server, replicaIdx int, loadedModels map inferenceGrpcPort: config.GetInferenceGrpcPort(), serverName: server.name, replicaIdx: replicaIdx, - server: server, capabilities: cleanCapabilities(config.GetCapabilities()), memory: config.GetMemoryBytes(), availableMemory: availableMemoryBytes, @@ -687,7 +684,6 @@ func (s *ServerReplica) createSnapshot(modelDetails bool) *ServerReplica { inferenceGrpcPort: s.inferenceGrpcPort, serverName: s.serverName, replicaIdx: s.replicaIdx, - server: nil, capabilities: capabilities, memory: s.memory, availableMemory: s.availableMemory, diff --git a/scheduler/pkg/store/mock/store.go b/scheduler/pkg/store/mock/store.go index b61b312c26..d6091f1c84 100644 --- a/scheduler/pkg/store/mock/store.go +++ b/scheduler/pkg/store/mock/store.go @@ -79,6 +79,20 @@ func (mr *MockModelStoreMockRecorder) DrainServerReplica(serverName, replicaIdx return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DrainServerReplica", reflect.TypeOf((*MockModelStore)(nil).DrainServerReplica), serverName, replicaIdx) } +// EmitEvents mocks base method. +func (m *MockModelStore) EmitEvents() error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "EmitEvents") + ret0, _ := ret[0].(error) + return ret0 +} + +// EmitEvents indicates an expected call of EmitEvents. +func (mr *MockModelStoreMockRecorder) EmitEvents() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EmitEvents", reflect.TypeOf((*MockModelStore)(nil).EmitEvents)) +} + // FailedScheduling mocks base method. func (m *MockModelStore) FailedScheduling(modelID string, version uint32, reason string, reset bool) error { m.ctrl.T.Helper() @@ -94,11 +108,12 @@ func (mr *MockModelStoreMockRecorder) FailedScheduling(modelID, version, reason, } // GetAllModels mocks base method. -func (m *MockModelStore) GetAllModels() []string { +func (m *MockModelStore) GetAllModels() ([]string, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "GetAllModels") ret0, _ := ret[0].([]string) - return ret0 + ret1, _ := ret[1].(error) + return ret0, ret1 } // GetAllModels indicates an expected call of GetAllModels. diff --git a/scheduler/pkg/store/storage.go b/scheduler/pkg/store/storage.go index eadc3f7f2f..4dba69ed2e 100644 --- a/scheduler/pkg/store/storage.go +++ b/scheduler/pkg/store/storage.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + package store import "errors" diff --git a/scheduler/pkg/store/store.go b/scheduler/pkg/store/store.go index 3552ed2335..48c5574bb9 100644 --- a/scheduler/pkg/store/store.go +++ b/scheduler/pkg/store/store.go @@ -154,7 +154,7 @@ type ModelStore interface { RemoveServerReplica(serverName string, replicaIdx int) ([]string, error) // return previously loaded models DrainServerReplica(serverName string, replicaIdx int) ([]string, error) // return previously loaded models FailedScheduling(modelID string, version uint32, reason string, reset bool) error - GetAllModels() []string + GetAllModels() ([]string, error) SetModelGwModelState(name string, versionNumber uint32, status ModelState, reason string, source string) error EmitEvents() error } From e282f42bd841449e036b4aeb3003ab2702c96e53 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 29 Dec 2025 19:10:41 +0000 Subject: [PATCH 03/50] wip --- apis/go/mlops/scheduler/db/db_ext.go | 121 ++++++++++++++++++ apis/mlops/scheduler/db/db.proto | 6 +- scheduler/pkg/store/in-memory/store.go | 153 ++++++++++++++++++++++ scheduler/pkg/store/memory.go | 169 ++++++++++++------------- scheduler/pkg/store/memory_status.go | 58 +++++---- scheduler/pkg/store/mesh.go | 18 +-- scheduler/pkg/store/storage.go | 21 +-- scheduler/pkg/store/store.go | 15 ++- 8 files changed, 422 insertions(+), 139 deletions(-) create mode 100644 apis/go/mlops/scheduler/db/db_ext.go create mode 100644 scheduler/pkg/store/in-memory/store.go diff --git a/apis/go/mlops/scheduler/db/db_ext.go b/apis/go/mlops/scheduler/db/db_ext.go new file mode 100644 index 0000000000..d4cdbe9425 --- /dev/null +++ b/apis/go/mlops/scheduler/db/db_ext.go @@ -0,0 +1,121 @@ +package db + +import ( + "time" + + "google.golang.org/protobuf/types/known/timestamppb" +) + +func (m *ModelVersion) GetAssignment() []int { + var assignment []int + var draining []int + + for k, v := range m.Replicas { + if v.Status.State == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || + v.Status.State == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || + v.Status.State == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE { + assignment = append(assignment, k) + } + if v.Status.State == ModelReplicaState_MODEL_REPLICA_STATE_DRAINING { + draining = append(draining, k) + } + } + + // prefer assignments that are not draining as envoy is eventual consistent + if len(assignment) > 0 { + return assignment + } + if len(draining) > 0 { + return draining + } + return nil +} + +func (m *ModelVersion) SetReplicaState(replicaIdx int, state ModelReplicaState, reason string) { + m.Replicas[replicaIdx] = &ReplicaStatusEntry{State: state, Timestamp: timestamppb.Now(), Reason: reason} +} + +func (m *Model) Latest() *ModelVersion { + if len(m.Versions) > 0 { + return m.Versions[len(m.Versions)-1] + } + return nil +} + +func (m *Model) HasLatest() bool { + return len(m.Versions) > 0 +} + +func (m *Model) GetVersion(version uint32) *ModelVersion { + for _, mv := range m.Versions { + if mv.GetVersion() == version { + return mv + } + } + return nil +} + +// TODO do we need to consider previous versions? +func (m *Model) Inactive() bool { + return m.Latest().Inactive() +} + +func (m *ModelVersion) Inactive() bool { + for _, v := range m.Replicas { + if !v.Status.State.Inactive() { + return false + } + } + return true +} + +func (m ModelReplicaState) CanReceiveTraffic() bool { + return m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || + m == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || + m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE || + m == ModelReplicaState_MODEL_REPLICA_STATE_DRAINING +} + +func (m ModelReplicaState) AlreadyLoadingOrLoaded() bool { + return m == ModelReplicaState_MODEL_REPLICA_STATE_LOADING || + m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || + m == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || + m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE +} + +func (m ModelReplicaState) UnloadingOrUnloaded() bool { + return m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED || + m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED || + m == ModelReplicaState_MODEL_REPLICA_STATE_LOADING || + m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED || + m == ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN +} + +func (m ModelReplicaState) Inactive() bool { + return m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED || + m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_FAILED || + m == ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN || + m == ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED +} + +func (m ModelReplicaState) IsLoadingOrLoaded() bool { + return m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || + m == ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED || + m == ModelReplicaState_MODEL_REPLICA_STATE_LOADING || + m == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE +} + +// +// Servers +// + +func (s *ServerReplica) GetLoadedOrLoadingModelVersions() []*ModelVersionID { + var models []*ModelVersionID + for _, model := range s.LoadingModels { + models = append(models, model.ModelVersionId) + } + for _, model := range s.LoadingModels { + models = append(models, model.ModelVersionId) + } + return models +} diff --git a/apis/mlops/scheduler/db/db.proto b/apis/mlops/scheduler/db/db.proto index 2830bc414f..f647128ff4 100644 --- a/apis/mlops/scheduler/db/db.proto +++ b/apis/mlops/scheduler/db/db.proto @@ -66,15 +66,13 @@ message ReplicaStatusEntry { // ModelVersion represents a version of a model message ModelVersion { - seldon.mlops.scheduler.Model model_defn = 1; // Assuming pb.Model is defined elsewhere + seldon.mlops.scheduler.Model model_defn = 1; uint32 version = 2; string server = 3; - // Using repeated messages since proto doesn't support map directly - repeated ReplicaStatusEntry replicas = 4; + map replicas = 4; ModelStatus state = 5; } - // ModelVersionID uniquely identifies a model version message ModelVersionID { string name = 1; diff --git a/scheduler/pkg/store/in-memory/store.go b/scheduler/pkg/store/in-memory/store.go new file mode 100644 index 0000000000..0e11dd5589 --- /dev/null +++ b/scheduler/pkg/store/in-memory/store.go @@ -0,0 +1,153 @@ +package in_memory + +import ( + "fmt" + "sync" + + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" + "google.golang.org/protobuf/proto" +) + +type Storage struct { + mu sync.RWMutex + servers map[string]*db.Server + models map[string]*db.Model +} + +func NewStorage() *Storage { + return &Storage{ + servers: make(map[string]*db.Server), + models: make(map[string]*db.Model), + } +} + +// GetModel retrieves a model by name, returning a clone to prevent external modification +func (s *Storage) GetModel(name string) (*db.Model, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + model, ok := s.models[name] + if !ok { + return nil, store.ErrNotFound + } + + return proto.Clone(model).(*db.Model), nil +} + +// AddModel adds a new model, storing a clone to prevent external modification +func (s *Storage) AddModel(model *db.Model) error { + s.mu.Lock() + defer s.mu.Unlock() + + if _, ok := s.models[model.Name]; ok { + return fmt.Errorf("model with name %s already exists", model.Name) + } + + s.models[model.Name] = proto.Clone(model).(*db.Model) + return nil +} + +// ListModels returns all models, cloning each to prevent external modification +func (s *Storage) ListModels() ([]*db.Model, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + models := make([]*db.Model, 0, len(s.models)) + for _, model := range s.models { + models = append(models, proto.Clone(model).(*db.Model)) + } + + return models, nil +} + +// UpdateModel updates an existing model, storing a clone to prevent external modification +func (s *Storage) UpdateModel(model *db.Model) error { + s.mu.Lock() + defer s.mu.Unlock() + + if _, ok := s.models[model.Name]; !ok { + return store.ErrNotFound + } + + s.models[model.Name] = proto.Clone(model).(*db.Model) + return nil +} + +// DeleteModel removes a model by name +func (s *Storage) DeleteModel(name string) error { + s.mu.Lock() + defer s.mu.Unlock() + + if _, ok := s.models[name]; !ok { + return store.ErrNotFound + } + + delete(s.models, name) + return nil +} + +// GetServer retrieves a server by name, returning a clone to prevent external modification +func (s *Storage) GetServer(name string) (*db.Server, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + server, ok := s.servers[name] + if !ok { + return nil, store.ErrNotFound + } + + return proto.Clone(server).(*db.Server), nil +} + +// AddServer adds a new server, storing a clone to prevent external modification +func (s *Storage) AddServer(server *db.Server) error { + s.mu.Lock() + defer s.mu.Unlock() + + if _, ok := s.servers[server.Name]; ok { + return fmt.Errorf("server with name %s already exists", server.Name) + } + + s.servers[server.Name] = proto.Clone(server).(*db.Server) + return nil +} + +// ListServers returns all servers, cloning each to prevent external modification +func (s *Storage) ListServers() ([]*db.Server, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + servers := make([]*db.Server, 0, len(s.servers)) + for _, server := range s.servers { + servers = append(servers, proto.Clone(server).(*db.Server)) + } + + return servers, nil +} + +// UpdateServer updates an existing server, storing a clone to prevent external modification +func (s *Storage) UpdateServer(server *db.Server) error { + s.mu.Lock() + defer s.mu.Unlock() + + if _, ok := s.servers[server.Name]; !ok { + return store.ErrNotFound + } + + s.servers[server.Name] = proto.Clone(server).(*db.Server) + return nil +} + +// DeleteServer removes a server by name +func (s *Storage) DeleteServer(serverName string) error { + s.mu.Lock() + defer s.mu.Unlock() + + if _, ok := s.servers[serverName]; !ok { + return store.ErrNotFound + } + + delete(s.servers, serverName) + return nil +} diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index e6a1071795..d15c9d6640 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -15,6 +15,7 @@ import ( "sort" "sync" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" @@ -57,30 +58,15 @@ func (m *MemoryStore) GetAllModels() ([]string, error) { } for _, model := range models { - modelNames = append(modelNames, model.Name()) + modelNames = append(modelNames, model.GetName()) } return modelNames, nil } -func (m *MemoryStore) GetModels() ([]*ModelSnapshot, error) { +func (m *MemoryStore) GetModels() ([]*db.Model, error) { m.mu.RLock() defer m.mu.RUnlock() - - models, err := m.store.ListModels() - if err != nil { - return nil, err - } - - foundModels := []*ModelSnapshot{} - for _, model := range models { - snapshot := &ModelSnapshot{ - Name: model.Name(), - Deleted: model.IsDeleted(), - Versions: model.versions, - } - foundModels = append(foundModels, snapshot) - } - return foundModels, nil + return m.store.ListModels() } func (m *MemoryStore) addModelVersionIfNotExists(req *agent.ModelVersion) (*Model, *ModelVersion, error) { @@ -108,27 +94,28 @@ func (m *MemoryStore) addModelVersionIfNotExists(req *agent.ModelVersion) (*Mode } } -func (m *MemoryStore) addNextModelVersion(model *Model, pbmodel *pb.Model) { +func (m *MemoryStore) addNextModelVersion(model *db.Model, pbmodel *pb.Model) { // if we start from a clean state, lets use the generation id as the starting version // this is to ensure that we have monotonic increasing version numbers // and we never reset back to 1 generation := pbmodel.GetMeta().GetKubernetesMeta().GetGeneration() version := max(uint32(1), uint32(generation)) if model.Latest() != nil { - version = model.Latest().GetVersion() + 1 + version = model.Versions[len(model.Versions)-1].Version + 1 } modelVersion := NewDefaultModelVersion(pbmodel, version) - model.versions = append(model.versions, modelVersion) - sort.SliceStable(model.versions, func(i, j int) bool { // resort model versions based on version number - return model.versions[i].GetVersion() < model.versions[j].GetVersion() + model.Versions = append(model.Versions, modelVersion) + sort.SliceStable(model.Versions, func(i, j int) bool { // resort model versions based on version number + return model.Versions[i].GetVersion() < model.Versions[j].GetVersion() }) } func (m *MemoryStore) UpdateModel(req *pb.LoadModelRequest) error { - logger := m.logger.WithField("func", "UpdateModel") m.mu.Lock() defer m.mu.Unlock() + + logger := m.logger.WithField("func", "UpdateModel") modelName := req.GetModel().GetMeta().GetName() validName := utils.CheckName(modelName) if !validName { @@ -140,53 +127,65 @@ func (m *MemoryStore) UpdateModel(req *pb.LoadModelRequest) error { model, err := m.store.GetModel(modelName) if err != nil { if !errors.Is(err, ErrNotFound) { - return err - } - // TODO pointer - model = &Model{ - name: modelName, + return fmt.Errorf("failed to get model %s: %w", modelName, err) } + model = &db.Model{Name: modelName} + m.addNextModelVersion(model, req.GetModel()) if err := m.store.AddModel(model); err != nil { - return err + return fmt.Errorf("failed to update model %s: %w", modelName, err) } - m.addNextModelVersion(model, req.GetModel()) - } else if model.IsDeleted() { + return nil + } + + if model.Deleted { if model.Inactive() { - // TODO pointer - model = &Model{ - name: modelName, - } - if err := m.store.AddModel(model); err != nil { - return err - } - m.addNextModelVersion(model, req.GetModel()) - } else { - return fmt.Errorf( - "Model %s is in process of deletion - new model can not be created", - modelName, - ) - } - } else { - meq := ModelEqualityCheck(model.Latest().modelDefn, req.GetModel()) - if meq.Equal { - logger.Debugf("Model %s semantically equal - doing nothing", modelName) - return nil - } else if meq.ModelSpecDiffers { - logger.Debugf("Model %s model spec differs - adding new version of model", modelName) m.addNextModelVersion(model, req.GetModel()) + if err := m.store.UpdateModel(model); err != nil { + return fmt.Errorf("failed to update model %s: %w", modelName, err) + } return nil - } else if meq.DeploymentSpecDiffers { - logger.Debugf( - "Model %s deployment spec differs - updating latest model version with new spec", - modelName, - ) - model.Latest().SetDeploymentSpec(req.GetModel().GetDeploymentSpec()) } - if meq.MetaDiffers { - // Update just kubernetes meta - model.Latest().UpdateKubernetesMeta(req.GetModel().GetMeta().GetKubernetesMeta()) + return fmt.Errorf( + "model %s is in process of deletion - new model can not be created", + modelName, + ) + + } + + latestModel := model.Latest() + if latestModel == nil { + return fmt.Errorf("model %s has no latest version", modelName) + } + + meq := ModelEqualityCheck(latestModel.ModelDefn, req.GetModel()) + changed := false + + switch { + case meq.Equal: + logger.Debugf("Model %s semantically equal - doing nothing", modelName) + case meq.ModelSpecDiffers: + logger.Debugf("Model %s model spec differs - adding new version of model", modelName) + m.addNextModelVersion(model, req.GetModel()) + changed = true + case meq.DeploymentSpecDiffers: + logger.Debugf( + "Model %s deployment spec differs - updating latest model version with new spec", + modelName, + ) + latestModel.ModelDefn.DeploymentSpec = req.GetModel().GetDeploymentSpec() + changed = true + case meq.MetaDiffers: + // Update just kubernetes meta + latestModel.ModelDefn.Meta.KubernetesMeta = req.GetModel().GetMeta().GetKubernetesMeta() + changed = true + } + + if changed { + if err := m.store.UpdateModel(model); err != nil { + return fmt.Errorf("failed to update model %s: %w", modelName, err) } } + return nil } @@ -231,7 +230,7 @@ func (m *MemoryStore) UnlockModel(modelId string) { } } -func (m *MemoryStore) GetModel(key string) (*ModelSnapshot, error) { +func (m *MemoryStore) GetModel(key string) (*db.Model, error) { m.mu.RLock() defer m.mu.RUnlock() return m.getModelImpl(key), nil @@ -248,51 +247,48 @@ func (m *MemoryStore) RemoveModel(req *pb.UnloadModelRequest) error { func (m *MemoryStore) removeModelImpl(req *pb.UnloadModelRequest) error { m.mu.Lock() defer m.mu.Unlock() + modelName := req.GetModel().GetName() model, err := m.store.GetModel(modelName) if err == nil { // Updating the k8s meta is required to be updated so status updates back (to manager) // will match latest generation value. Previous generation values might be ignored by manager. if req.GetKubernetesMeta() != nil { // k8s meta can be nil if unload is called directly using scheduler grpc api - model.Latest().UpdateKubernetesMeta(req.GetKubernetesMeta()) + model.Latest().ModelDefn.Meta.KubernetesMeta = req.GetKubernetesMeta() } - model.SetDeleted() + model.Deleted = true m.setModelGwStatusToTerminate(true, model.Latest()) m.updateModelStatus(true, true, model.Latest(), model.GetLastAvailableModelVersion()) return nil } if errors.Is(err, ErrNotFound) { - return fmt.Errorf("Model %s not found", req.GetModel().GetName()) + return fmt.Errorf("model %s not found", req.GetModel().GetName()) } return err } -func (m *MemoryStore) GetServers(shallow bool, modelDetails bool) ([]*ServerSnapshot, error) { +func (m *MemoryStore) GetServers(shallow bool, modelDetails bool) ([]*db.Server, error) { m.mu.RLock() defer m.mu.RUnlock() servers, err := m.store.ListServers() if err != nil { - return nil, err + return nil, fmt.Errorf("failed to list servers: %w", err) } - snapshots := make([]*ServerSnapshot, 0, len(servers)) - for _, server := range servers { - snapshots = append(snapshots, server.CreateSnapshot(shallow, modelDetails)) - } - return snapshots, nil + return servers, nil } -func (m *MemoryStore) GetServer(serverKey string, shallow bool, modelDetails bool) (*ServerSnapshot, error) { +func (m *MemoryStore) GetServer(serverKey string, shallow bool, modelDetails bool) (*db.Server, error) { m.mu.RLock() defer m.mu.RUnlock() server, err := m.store.GetServer(serverKey) if err != nil { if errors.Is(err, ErrNotFound) { - return nil, fmt.Errorf("Server [%s] not found", serverKey) + return nil, fmt.Errorf("server [%s] not found", serverKey) } return nil, err } @@ -306,6 +302,7 @@ func (m *MemoryStore) GetServer(serverKey string, shallow bool, modelDetails boo return nil, err } } + return snapshot, nil } @@ -351,7 +348,7 @@ func (m *MemoryStore) UpdateLoadedModels( modelKey string, version uint32, serverKey string, - replicas []*ServerReplica, + replicas []*db.ServerReplica, ) error { m.mu.Lock() modelEvt, err := m.updateLoadedModelsImpl(modelKey, version, serverKey, replicas) @@ -556,8 +553,8 @@ func (m *MemoryStore) UpdateModelState( serverKey string, replicaIdx int, availableMemory *uint64, - expectedState ModelReplicaState, - desiredState ModelReplicaState, + expectedState db.ModelReplicaState, + desiredState db.ModelReplicaState, reason string, runtimeInfo *pb.ModelRuntimeInfo, ) error { @@ -825,7 +822,7 @@ func (m *MemoryStore) removeModelfromServerReplica(lModels map[ModelVersionID]bo modelVersion := model.GetVersion(modelVersionID.Version) if modelVersion != nil { modelVersion.DeleteReplica(replicaIdx) - if model.IsDeleted() || model.Latest().GetVersion() != modelVersion.GetVersion() { + if model.Deleted || model.Latest().GetVersion() != modelVersion.GetVersion() { // In some cases we found that the user can ask for a model to be deleted and the model replica // is still in the process of being loaded. In this case we should not reschedule the model. logger.Debugf( @@ -871,19 +868,19 @@ func (m *MemoryStore) drainServerReplicaImpl(serverName string, replicaIdx int) if err != nil { return nil, fmt.Errorf("Failed to find server %s", serverName) } - serverReplica, ok := server.replicas[replicaIdx] + serverReplica, ok := server.Replicas[int32(replicaIdx)] if !ok { - return nil, fmt.Errorf("Failed to find replica %d for server %s", replicaIdx, serverName) + return nil, fmt.Errorf("failed to find replica %d for server %s", replicaIdx, serverName) } // we mark this server replica as draining so should not be used in future scheduling decisions - serverReplica.SetIsDraining() + serverReplica.IsDraining = true - loadedModels := m.findModelsToReSchedule(serverReplica.loadedModels, replicaIdx) + loadedModels := m.findModelsToReSchedule(serverReplica.LoadingModels, replicaIdx) if len(loadedModels) > 0 { logger.WithField("models", loadedModels).Debug("Found loaded models to re-schedule") } - loadingModels := m.findModelsToReSchedule(serverReplica.loadingModels, replicaIdx) + loadingModels := m.findModelsToReSchedule(serverReplica.LoadingModels, replicaIdx) if len(loadingModels) > 0 { logger.WithField("models", loadingModels).Debug("Found loading models to re-schedule") } @@ -946,7 +943,7 @@ func (m *MemoryStore) numEmptyServerReplicas(serverName string) (uint32, error) } return emptyReplicas, nil } - for _, replica := range server.replicas { + for _, replica := range server.Replicas { if len(replica.GetLoadedOrLoadingModelVersions()) == 0 { emptyReplicas++ } @@ -982,7 +979,7 @@ func toSchedulerLoadedModels(agentLoadedModels []*agent.ModelVersion) map[ModelV return loadedModels } -func (m *MemoryStore) SetModelGwModelState(name string, versionNumber uint32, status ModelState, reason string, source string) error { +func (m *MemoryStore) SetModelGwModelState(name string, versionNumber uint32, status db.ModelState, reason string, source string) error { logger := m.logger.WithField("func", "SetModelGwModelState") logger.Debugf("Attempt to set model-gw state on model %s:%d status:%s", name, versionNumber, status.String()) diff --git a/scheduler/pkg/store/memory_status.go b/scheduler/pkg/store/memory_status.go index 13ec647bb6..989caef9d6 100644 --- a/scheduler/pkg/store/memory_status.go +++ b/scheduler/pkg/store/memory_status.go @@ -13,7 +13,9 @@ import ( "fmt" "time" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" + "google.golang.org/protobuf/types/known/timestamppb" ) const ( @@ -115,36 +117,40 @@ func (m *MemoryStore) FailedScheduling(modelID string, version uint32, reason st m.mu.Lock() defer m.mu.Unlock() - model, ok := m.store.models[modelID] - if !ok { - return fmt.Errorf("model %s not found", modelID) + model, err := m.store.GetModel(modelID) + if err != nil { + return fmt.Errorf("model %s not found: %w", modelID, err) } // likely the failed model version is the latest, so we loop through in reverse order - for i := len(model.versions) - 1; i >= 0; i-- { - modelVersion := model.versions[i] + for i := len(model.Versions) - 1; i >= 0; i-- { + modelVersion := model.Versions[i] - if modelVersion.version == version { + if modelVersion.Version == version { // we use len of GetAssignment instead of .state.AvailableReplicas as it is more accurate in this context availableReplicas := uint32(len(modelVersion.GetAssignment())) - modelVersion.state = ModelStatus{ - State: ScheduleFailed, - ModelGwState: modelVersion.state.ModelGwState, + modelVersion.State = &db.ModelStatus{ + State: db.ModelState_MODEL_STATE_SCHEDULE_FAILED, + ModelGwState: modelVersion.State.ModelGwState, Reason: reason, - ModelGwReason: modelVersion.state.ModelGwReason, - Timestamp: time.Now(), + ModelGwReason: modelVersion.State.ModelGwReason, + Timestamp: timestamppb.Now(), AvailableReplicas: availableReplicas, - UnavailableReplicas: modelVersion.GetModel().GetDeploymentSpec().GetReplicas() - availableReplicas, + UnavailableReplicas: modelVersion.ModelDefn.GetDeploymentSpec().GetReplicas() - availableReplicas, } // make sure we reset server but only if there are no available replicas if reset { - modelVersion.SetServer("") + modelVersion.Server = "" + } + + if err := m.store.UpdateModel(model); err != nil { + return fmt.Errorf("failed to update model %s: %w", modelID, err) } m.eventHub.PublishModelEvent( modelFailureEventSource, coordinator.ModelEventMsg{ - ModelName: modelVersion.GetMeta().GetName(), + ModelName: modelVersion.ModelDefn.Meta.Name, ModelVersion: modelVersion.GetVersion(), }, ) @@ -164,24 +170,23 @@ func (m *MemoryStore) updateModelStatus(isLatest bool, deleted bool, modelVersio updateModelState(isLatest, modelVersion, prevModelVersion, stats, deleted) } -func (m *MemoryStore) setModelGwStatusToTerminate(isLatest bool, modelVersion *ModelVersion) { +func (m *MemoryStore) setModelGwStatusToTerminate(isLatest bool, modelVersion *db.ModelVersion) { if !isLatest { - modelVersion.state.ModelGwState = ModelTerminated - modelVersion.state.ModelGwReason = "Not latest version" - } else { - modelVersion.state.ModelGwState = ModelTerminate - modelVersion.state.ModelGwReason = "Model deleted" + modelVersion.State.ModelGwState = db.ModelState_MODEL_STATE_TERMINATED + modelVersion.State.ModelGwReason = "Not latest version" + return } + modelVersion.State.ModelGwState = db.ModelState_MODEL_STATE_TERMINATE + modelVersion.State.ModelGwReason = "Model deleted" } func (m *MemoryStore) UnloadModelGwVersionModels(modelKey string, version uint32) (bool, error) { m.mu.Lock() defer m.mu.Unlock() - fmt.Println("UnloadModelGwVersionModels called for ", modelKey, " version ", version) - model, ok := m.store.models[modelKey] - if !ok { - return false, fmt.Errorf("failed to find model %s", modelKey) + model, err := m.store.GetModel(modelKey) + if err != nil { + return false, fmt.Errorf("failed to find model %s: %w", modelKey, err) } modelVersion := model.GetVersion(version) @@ -190,5 +195,10 @@ func (m *MemoryStore) UnloadModelGwVersionModels(modelKey string, version uint32 } m.setModelGwStatusToTerminate(false, modelVersion) + + if err := m.store.UpdateModel(model); err != nil { + return false, fmt.Errorf("failed to update model %s: %w", modelKey, err) + } + return true, nil } diff --git a/scheduler/pkg/store/mesh.go b/scheduler/pkg/store/mesh.go index 524400bc8d..80edf3e399 100644 --- a/scheduler/pkg/store/mesh.go +++ b/scheduler/pkg/store/mesh.go @@ -16,6 +16,7 @@ import ( "sync/atomic" "time" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "google.golang.org/protobuf/proto" pba "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" @@ -78,16 +79,15 @@ type ReplicaStatus struct { Timestamp time.Time } -func NewDefaultModelVersion(model *pb.Model, version uint32) *ModelVersion { - return &ModelVersion{ - version: version, - modelDefn: model, - replicas: make(map[int]ReplicaStatus), - state: ModelStatus{ - State: ModelStateUnknown, - ModelGwState: ModelCreate, +func NewDefaultModelVersion(model *pb.Model, version uint32) *db.ModelVersion { + return &db.ModelVersion{ + Version: version, + ModelDefn: model, + Replicas: make([]*db.ReplicaStatusEntry, 0), + State: &db.ModelStatus{ + State: db.ModelState_MODEL_STATE_UNKNOWN, + ModelGwState: db.ModelState_MODEL_STATE_CREATE, }, - mu: sync.RWMutex{}, } } diff --git a/scheduler/pkg/store/storage.go b/scheduler/pkg/store/storage.go index 4dba69ed2e..95ccd197e4 100644 --- a/scheduler/pkg/store/storage.go +++ b/scheduler/pkg/store/storage.go @@ -9,21 +9,24 @@ the Change License after the Change Date as each is defined in accordance with t package store -import "errors" +import ( + "errors" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" +) var ( ErrNotFound = errors.New("not found") ) type Storage interface { - GetModel(name string) (*Model, error) - AddModel(model *Model) error - ListModels() ([]*Model, error) - UpdateModel(model *Model) error + GetModel(name string) (*db.Model, error) + AddModel(model *db.Model) error + ListModels() ([]*db.Model, error) + UpdateModel(model *db.Model) error DeleteModel(name string) error - GetServer(name string) (*Server, error) - AddServer(server *Server) error - ListServers() ([]*Server, error) - UpdateServer(server *Server) error + GetServer(name string) (*db.Server, error) + AddServer(server *db.Server) error + ListServers() ([]*db.Server, error) + UpdateServer(server *db.Server) error DeleteServer(serverName string) error } diff --git a/scheduler/pkg/store/store.go b/scheduler/pkg/store/store.go index 48c5574bb9..07b6d843d3 100644 --- a/scheduler/pkg/store/store.go +++ b/scheduler/pkg/store/store.go @@ -12,6 +12,7 @@ package store import ( pba "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" ) type ServerSnapshot struct { @@ -138,23 +139,23 @@ func (m *ModelSnapshot) GetVersionsBeforeLastModelGwAvailable() []*ModelVersion //go:generate go tool mockgen -source=./store.go -destination=./mock/store.go -package=mock ModelStore type ModelStore interface { UpdateModel(config *pb.LoadModelRequest) error - GetModel(key string) (*ModelSnapshot, error) - GetModels() ([]*ModelSnapshot, error) + GetModel(key string) (*db.Model, error) + GetModels() ([]*db.Model, error) LockModel(modelId string) UnlockModel(modelId string) RemoveModel(req *pb.UnloadModelRequest) error - GetServers(shallow bool, modelDetails bool) ([]*ServerSnapshot, error) - GetServer(serverKey string, shallow bool, modelDetails bool) (*ServerSnapshot, error) - UpdateLoadedModels(modelKey string, version uint32, serverKey string, replicas []*ServerReplica) error + GetServers(shallow bool, modelDetails bool) ([]*db.Server, error) + GetServer(serverKey string, shallow bool, modelDetails bool) (*db.Server, error) + UpdateLoadedModels(modelKey string, version uint32, serverKey string, replicas []*db.ServerReplica) error UnloadVersionModels(modelKey string, version uint32) (bool, error) UnloadModelGwVersionModels(modelKey string, version uint32) (bool, error) - UpdateModelState(modelKey string, version uint32, serverKey string, replicaIdx int, availableMemory *uint64, expectedState, desiredState ModelReplicaState, reason string, runtimeInfo *pb.ModelRuntimeInfo) error + UpdateModelState(modelKey string, version uint32, serverKey string, replicaIdx int, availableMemory *uint64, expectedState, desiredState db.ModelReplicaState, reason string, runtimeInfo *pb.ModelRuntimeInfo) error AddServerReplica(request *pba.AgentSubscribeRequest) error ServerNotify(request *pb.ServerNotify) error RemoveServerReplica(serverName string, replicaIdx int) ([]string, error) // return previously loaded models DrainServerReplica(serverName string, replicaIdx int) ([]string, error) // return previously loaded models FailedScheduling(modelID string, version uint32, reason string, reset bool) error GetAllModels() ([]string, error) - SetModelGwModelState(name string, versionNumber uint32, status ModelState, reason string, source string) error + SetModelGwModelState(name string, versionNumber uint32, status db.ModelState, reason string, source string) error EmitEvents() error } From 663ba91e568fba3ef8a1806cd7194b89b0aa626a Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 29 Dec 2025 22:36:57 +0000 Subject: [PATCH 04/50] abstracted storage, now uses proto to store model/servers --- apis/Makefile | 1 - apis/go/mlops/agent/agent.pb.go | 9 - apis/go/mlops/agent/agent_grpc.pb.go | 9 - apis/go/mlops/agent_debug/agent_debug.pb.go | 9 - .../mlops/agent_debug/agent_debug_grpc.pb.go | 9 - apis/go/mlops/chainer/chainer.pb.go | 9 - apis/go/mlops/chainer/chainer_grpc.pb.go | 9 - apis/go/mlops/health/health.pb.go | 9 - apis/go/mlops/health/health_grpc.pb.go | 9 - apis/go/mlops/proxy/proxy.pb.go | 9 - apis/go/mlops/proxy/proxy_grpc.pb.go | 9 - apis/go/mlops/scheduler/db/db.pb.go | 620 +++++++++--------- apis/go/mlops/scheduler/db/db_ext.go | 121 ---- apis/go/mlops/scheduler/db/model_ext.go | 181 +++++ apis/go/mlops/scheduler/db/server_ext.go | 83 +++ apis/go/mlops/scheduler/scheduler.pb.go | 9 - apis/go/mlops/scheduler/scheduler_grpc.pb.go | 9 - apis/go/mlops/scheduler/storage.pb.go | 273 -------- apis/mlops/scheduler/db/db.proto | 42 +- apis/mlops/scheduler/storage.proto | 23 - 20 files changed, 610 insertions(+), 842 deletions(-) delete mode 100644 apis/go/mlops/scheduler/db/db_ext.go create mode 100644 apis/go/mlops/scheduler/db/model_ext.go create mode 100644 apis/go/mlops/scheduler/db/server_ext.go delete mode 100644 apis/go/mlops/scheduler/storage.pb.go delete mode 100644 apis/mlops/scheduler/storage.proto diff --git a/apis/Makefile b/apis/Makefile index 4f75133c1f..1a903a95df 100644 --- a/apis/Makefile +++ b/apis/Makefile @@ -162,7 +162,6 @@ build-go: \ ./mlops/agent_debug/agent_debug.proto \ ./mlops/proxy/proxy.proto \ ./mlops/scheduler/scheduler.proto \ - ./mlops/scheduler/storage.proto \ ./mlops/scheduler/db/db.proto \ ./mlops/chainer/chainer.proto \ ./mlops/v2_dataplane/v2_dataplane.proto \ diff --git a/apis/go/mlops/agent/agent.pb.go b/apis/go/mlops/agent/agent.pb.go index 855bd30c5b..e3b764b9ec 100644 --- a/apis/go/mlops/agent/agent.pb.go +++ b/apis/go/mlops/agent/agent.pb.go @@ -1,12 +1,3 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.34.2 diff --git a/apis/go/mlops/agent/agent_grpc.pb.go b/apis/go/mlops/agent/agent_grpc.pb.go index e82793408c..da32efd503 100644 --- a/apis/go/mlops/agent/agent_grpc.pb.go +++ b/apis/go/mlops/agent/agent_grpc.pb.go @@ -1,12 +1,3 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.4.0 diff --git a/apis/go/mlops/agent_debug/agent_debug.pb.go b/apis/go/mlops/agent_debug/agent_debug.pb.go index 9605182cd8..cbff1f39c3 100644 --- a/apis/go/mlops/agent_debug/agent_debug.pb.go +++ b/apis/go/mlops/agent_debug/agent_debug.pb.go @@ -1,12 +1,3 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - // Debug service for agent replica // Code generated by protoc-gen-go. DO NOT EDIT. diff --git a/apis/go/mlops/agent_debug/agent_debug_grpc.pb.go b/apis/go/mlops/agent_debug/agent_debug_grpc.pb.go index dfb4269ef2..0dd305f361 100644 --- a/apis/go/mlops/agent_debug/agent_debug_grpc.pb.go +++ b/apis/go/mlops/agent_debug/agent_debug_grpc.pb.go @@ -1,12 +1,3 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - // Debug service for agent replica // Code generated by protoc-gen-go-grpc. DO NOT EDIT. diff --git a/apis/go/mlops/chainer/chainer.pb.go b/apis/go/mlops/chainer/chainer.pb.go index c31ae7b569..0eec999a48 100644 --- a/apis/go/mlops/chainer/chainer.pb.go +++ b/apis/go/mlops/chainer/chainer.pb.go @@ -1,12 +1,3 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.34.2 diff --git a/apis/go/mlops/chainer/chainer_grpc.pb.go b/apis/go/mlops/chainer/chainer_grpc.pb.go index a4974064f7..28defc847b 100644 --- a/apis/go/mlops/chainer/chainer_grpc.pb.go +++ b/apis/go/mlops/chainer/chainer_grpc.pb.go @@ -1,12 +1,3 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.4.0 diff --git a/apis/go/mlops/health/health.pb.go b/apis/go/mlops/health/health.pb.go index 11ef4b5936..7340c165d2 100644 --- a/apis/go/mlops/health/health.pb.go +++ b/apis/go/mlops/health/health.pb.go @@ -1,12 +1,3 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.34.2 diff --git a/apis/go/mlops/health/health_grpc.pb.go b/apis/go/mlops/health/health_grpc.pb.go index 7c5c501b13..a5d1c62e18 100644 --- a/apis/go/mlops/health/health_grpc.pb.go +++ b/apis/go/mlops/health/health_grpc.pb.go @@ -1,12 +1,3 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.4.0 diff --git a/apis/go/mlops/proxy/proxy.pb.go b/apis/go/mlops/proxy/proxy.pb.go index f35d13234d..bda6bf09ff 100644 --- a/apis/go/mlops/proxy/proxy.pb.go +++ b/apis/go/mlops/proxy/proxy.pb.go @@ -1,12 +1,3 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.34.2 diff --git a/apis/go/mlops/proxy/proxy_grpc.pb.go b/apis/go/mlops/proxy/proxy_grpc.pb.go index 528dbec929..de21a28d47 100644 --- a/apis/go/mlops/proxy/proxy_grpc.pb.go +++ b/apis/go/mlops/proxy/proxy_grpc.pb.go @@ -1,12 +1,3 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.4.0 diff --git a/apis/go/mlops/scheduler/db/db.pb.go b/apis/go/mlops/scheduler/db/db.pb.go index 30f89c659a..d95f186522 100644 --- a/apis/go/mlops/scheduler/db/db.pb.go +++ b/apis/go/mlops/scheduler/db/db.pb.go @@ -1,12 +1,3 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.34.2 @@ -185,6 +176,135 @@ func (ModelReplicaState) EnumDescriptor() ([]byte, []int) { return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{1} } +type PipelineSnapshot struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` + LastVersion uint32 `protobuf:"varint,2,opt,name=lastVersion,proto3" json:"lastVersion,omitempty"` + Versions []*scheduler.PipelineWithState `protobuf:"bytes,3,rep,name=versions,proto3" json:"versions,omitempty"` + Deleted bool `protobuf:"varint,4,opt,name=deleted,proto3" json:"deleted,omitempty"` +} + +func (x *PipelineSnapshot) Reset() { + *x = PipelineSnapshot{} + if protoimpl.UnsafeEnabled { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *PipelineSnapshot) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*PipelineSnapshot) ProtoMessage() {} + +func (x *PipelineSnapshot) ProtoReflect() protoreflect.Message { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use PipelineSnapshot.ProtoReflect.Descriptor instead. +func (*PipelineSnapshot) Descriptor() ([]byte, []int) { + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{0} +} + +func (x *PipelineSnapshot) GetName() string { + if x != nil { + return x.Name + } + return "" +} + +func (x *PipelineSnapshot) GetLastVersion() uint32 { + if x != nil { + return x.LastVersion + } + return 0 +} + +func (x *PipelineSnapshot) GetVersions() []*scheduler.PipelineWithState { + if x != nil { + return x.Versions + } + return nil +} + +func (x *PipelineSnapshot) GetDeleted() bool { + if x != nil { + return x.Deleted + } + return false +} + +type ExperimentSnapshot struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Experiment *scheduler.Experiment `protobuf:"bytes,1,opt,name=experiment,proto3" json:"experiment,omitempty"` + // to mark the experiment as deleted, this is currently required as we persist all + // experiments in the local scheduler state (badgerdb) so that events can be replayed + // on restart, which would guard against lost events in communication. + Deleted bool `protobuf:"varint,2,opt,name=deleted,proto3" json:"deleted,omitempty"` +} + +func (x *ExperimentSnapshot) Reset() { + *x = ExperimentSnapshot{} + if protoimpl.UnsafeEnabled { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *ExperimentSnapshot) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ExperimentSnapshot) ProtoMessage() {} + +func (x *ExperimentSnapshot) ProtoReflect() protoreflect.Message { + mi := &file_mlops_scheduler_db_db_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ExperimentSnapshot.ProtoReflect.Descriptor instead. +func (*ExperimentSnapshot) Descriptor() ([]byte, []int) { + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{1} +} + +func (x *ExperimentSnapshot) GetExperiment() *scheduler.Experiment { + if x != nil { + return x.Experiment + } + return nil +} + +func (x *ExperimentSnapshot) GetDeleted() bool { + if x != nil { + return x.Deleted + } + return false +} + // ReplicaStatus represents the status of a replica type ReplicaStatus struct { state protoimpl.MessageState @@ -199,7 +319,7 @@ type ReplicaStatus struct { func (x *ReplicaStatus) Reset() { *x = ReplicaStatus{} if protoimpl.UnsafeEnabled { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[0] + mi := &file_mlops_scheduler_db_db_proto_msgTypes[2] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -212,7 +332,7 @@ func (x *ReplicaStatus) String() string { func (*ReplicaStatus) ProtoMessage() {} func (x *ReplicaStatus) ProtoReflect() protoreflect.Message { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[0] + mi := &file_mlops_scheduler_db_db_proto_msgTypes[2] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -225,7 +345,7 @@ func (x *ReplicaStatus) ProtoReflect() protoreflect.Message { // Deprecated: Use ReplicaStatus.ProtoReflect.Descriptor instead. func (*ReplicaStatus) Descriptor() ([]byte, []int) { - return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{0} + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{2} } func (x *ReplicaStatus) GetState() ModelReplicaState { @@ -268,7 +388,7 @@ type ModelStatus struct { func (x *ModelStatus) Reset() { *x = ModelStatus{} if protoimpl.UnsafeEnabled { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[1] + mi := &file_mlops_scheduler_db_db_proto_msgTypes[3] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -281,7 +401,7 @@ func (x *ModelStatus) String() string { func (*ModelStatus) ProtoMessage() {} func (x *ModelStatus) ProtoReflect() protoreflect.Message { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[1] + mi := &file_mlops_scheduler_db_db_proto_msgTypes[3] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -294,7 +414,7 @@ func (x *ModelStatus) ProtoReflect() protoreflect.Message { // Deprecated: Use ModelStatus.ProtoReflect.Descriptor instead. func (*ModelStatus) Descriptor() ([]byte, []int) { - return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{1} + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{3} } func (x *ModelStatus) GetState() ModelState { @@ -353,80 +473,23 @@ func (x *ModelStatus) GetTimestamp() *timestamppb.Timestamp { return nil } -// ReplicaStatusEntry for map representation -type ReplicaStatusEntry struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - ReplicaIndex int32 `protobuf:"varint,1,opt,name=replica_index,json=replicaIndex,proto3" json:"replica_index,omitempty"` - Status *ReplicaStatus `protobuf:"bytes,2,opt,name=status,proto3" json:"status,omitempty"` -} - -func (x *ReplicaStatusEntry) Reset() { - *x = ReplicaStatusEntry{} - if protoimpl.UnsafeEnabled { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[2] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *ReplicaStatusEntry) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ReplicaStatusEntry) ProtoMessage() {} - -func (x *ReplicaStatusEntry) ProtoReflect() protoreflect.Message { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[2] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ReplicaStatusEntry.ProtoReflect.Descriptor instead. -func (*ReplicaStatusEntry) Descriptor() ([]byte, []int) { - return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{2} -} - -func (x *ReplicaStatusEntry) GetReplicaIndex() int32 { - if x != nil { - return x.ReplicaIndex - } - return 0 -} - -func (x *ReplicaStatusEntry) GetStatus() *ReplicaStatus { - if x != nil { - return x.Status - } - return nil -} - // ModelVersion represents a version of a model type ModelVersion struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - ModelDefn *scheduler.Model `protobuf:"bytes,1,opt,name=model_defn,json=modelDefn,proto3" json:"model_defn,omitempty"` // Assuming pb.Model is defined elsewhere - Version uint32 `protobuf:"varint,2,opt,name=version,proto3" json:"version,omitempty"` - Server string `protobuf:"bytes,3,opt,name=server,proto3" json:"server,omitempty"` - // Using repeated messages since proto doesn't support map directly - Replicas []*ReplicaStatusEntry `protobuf:"bytes,4,rep,name=replicas,proto3" json:"replicas,omitempty"` - State *ModelStatus `protobuf:"bytes,5,opt,name=state,proto3" json:"state,omitempty"` + ModelDefn *scheduler.Model `protobuf:"bytes,1,opt,name=model_defn,json=modelDefn,proto3" json:"model_defn,omitempty"` + Version uint32 `protobuf:"varint,2,opt,name=version,proto3" json:"version,omitempty"` + Server string `protobuf:"bytes,3,opt,name=server,proto3" json:"server,omitempty"` + Replicas map[int32]*ReplicaStatus `protobuf:"bytes,4,rep,name=replicas,proto3" json:"replicas,omitempty" protobuf_key:"varint,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + State *ModelStatus `protobuf:"bytes,5,opt,name=state,proto3" json:"state,omitempty"` } func (x *ModelVersion) Reset() { *x = ModelVersion{} if protoimpl.UnsafeEnabled { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[3] + mi := &file_mlops_scheduler_db_db_proto_msgTypes[4] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -439,7 +502,7 @@ func (x *ModelVersion) String() string { func (*ModelVersion) ProtoMessage() {} func (x *ModelVersion) ProtoReflect() protoreflect.Message { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[3] + mi := &file_mlops_scheduler_db_db_proto_msgTypes[4] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -452,7 +515,7 @@ func (x *ModelVersion) ProtoReflect() protoreflect.Message { // Deprecated: Use ModelVersion.ProtoReflect.Descriptor instead. func (*ModelVersion) Descriptor() ([]byte, []int) { - return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{3} + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{4} } func (x *ModelVersion) GetModelDefn() *scheduler.Model { @@ -476,7 +539,7 @@ func (x *ModelVersion) GetServer() string { return "" } -func (x *ModelVersion) GetReplicas() []*ReplicaStatusEntry { +func (x *ModelVersion) GetReplicas() map[int32]*ReplicaStatus { if x != nil { return x.Replicas } @@ -503,7 +566,7 @@ type ModelVersionID struct { func (x *ModelVersionID) Reset() { *x = ModelVersionID{} if protoimpl.UnsafeEnabled { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[4] + mi := &file_mlops_scheduler_db_db_proto_msgTypes[5] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -516,7 +579,7 @@ func (x *ModelVersionID) String() string { func (*ModelVersionID) ProtoMessage() {} func (x *ModelVersionID) ProtoReflect() protoreflect.Message { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[4] + mi := &file_mlops_scheduler_db_db_proto_msgTypes[5] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -529,7 +592,7 @@ func (x *ModelVersionID) ProtoReflect() protoreflect.Message { // Deprecated: Use ModelVersionID.ProtoReflect.Descriptor instead. func (*ModelVersionID) Descriptor() ([]byte, []int) { - return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{4} + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{5} } func (x *ModelVersionID) GetName() string { @@ -560,7 +623,7 @@ type Model struct { func (x *Model) Reset() { *x = Model{} if protoimpl.UnsafeEnabled { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[5] + mi := &file_mlops_scheduler_db_db_proto_msgTypes[6] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -573,7 +636,7 @@ func (x *Model) String() string { func (*Model) ProtoMessage() {} func (x *Model) ProtoReflect() protoreflect.Message { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[5] + mi := &file_mlops_scheduler_db_db_proto_msgTypes[6] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -586,7 +649,7 @@ func (x *Model) ProtoReflect() protoreflect.Message { // Deprecated: Use Model.ProtoReflect.Descriptor instead. func (*Model) Descriptor() ([]byte, []int) { - return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{5} + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{6} } func (x *Model) GetName() string { @@ -628,7 +691,7 @@ type Server struct { func (x *Server) Reset() { *x = Server{} if protoimpl.UnsafeEnabled { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[6] + mi := &file_mlops_scheduler_db_db_proto_msgTypes[7] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } @@ -641,7 +704,7 @@ func (x *Server) String() string { func (*Server) ProtoMessage() {} func (x *Server) ProtoReflect() protoreflect.Message { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[6] + mi := &file_mlops_scheduler_db_db_proto_msgTypes[7] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { @@ -654,7 +717,7 @@ func (x *Server) ProtoReflect() protoreflect.Message { // Deprecated: Use Server.ProtoReflect.Descriptor instead. func (*Server) Descriptor() ([]byte, []int) { - return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{6} + return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{7} } func (x *Server) GetName() string { @@ -706,62 +769,6 @@ func (x *Server) GetKubernetesMeta() *scheduler.KubernetesMeta { return nil } -// ModelLoadStatus represents a model's load status (for maps that use ModelVersionID as key) -type ModelLoadStatus struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - ModelVersionId *ModelVersionID `protobuf:"bytes,1,opt,name=model_version_id,json=modelVersionId,proto3" json:"model_version_id,omitempty"` - Loaded bool `protobuf:"varint,2,opt,name=loaded,proto3" json:"loaded,omitempty"` -} - -func (x *ModelLoadStatus) Reset() { - *x = ModelLoadStatus{} - if protoimpl.UnsafeEnabled { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[7] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *ModelLoadStatus) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ModelLoadStatus) ProtoMessage() {} - -func (x *ModelLoadStatus) ProtoReflect() protoreflect.Message { - mi := &file_mlops_scheduler_db_db_proto_msgTypes[7] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ModelLoadStatus.ProtoReflect.Descriptor instead. -func (*ModelLoadStatus) Descriptor() ([]byte, []int) { - return file_mlops_scheduler_db_db_proto_rawDescGZIP(), []int{7} -} - -func (x *ModelLoadStatus) GetModelVersionId() *ModelVersionID { - if x != nil { - return x.ModelVersionId - } - return nil -} - -func (x *ModelLoadStatus) GetLoaded() bool { - if x != nil { - return x.Loaded - } - return false -} - // ServerReplica represents a single replica of a server type ServerReplica struct { state protoimpl.MessageState @@ -777,12 +784,12 @@ type ServerReplica struct { Memory uint64 `protobuf:"varint,7,opt,name=memory,proto3" json:"memory,omitempty"` AvailableMemory uint64 `protobuf:"varint,8,opt,name=available_memory,json=availableMemory,proto3" json:"available_memory,omitempty"` // Using repeated messages since proto doesn't support complex map keys - LoadedModels []*ModelLoadStatus `protobuf:"bytes,9,rep,name=loaded_models,json=loadedModels,proto3" json:"loaded_models,omitempty"` - LoadingModels []*ModelLoadStatus `protobuf:"bytes,10,rep,name=loading_models,json=loadingModels,proto3" json:"loading_models,omitempty"` - OverCommitPercentage uint32 `protobuf:"varint,11,opt,name=over_commit_percentage,json=overCommitPercentage,proto3" json:"over_commit_percentage,omitempty"` - ReservedMemory uint64 `protobuf:"varint,12,opt,name=reserved_memory,json=reservedMemory,proto3" json:"reserved_memory,omitempty"` - UniqueLoadedModels map[string]bool `protobuf:"bytes,13,rep,name=unique_loaded_models,json=uniqueLoadedModels,proto3" json:"unique_loaded_models,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"varint,2,opt,name=value,proto3"` - IsDraining bool `protobuf:"varint,14,opt,name=is_draining,json=isDraining,proto3" json:"is_draining,omitempty"` + LoadedModels []*ModelVersionID `protobuf:"bytes,9,rep,name=loaded_models,json=loadedModels,proto3" json:"loaded_models,omitempty"` + LoadingModels []*ModelVersionID `protobuf:"bytes,10,rep,name=loading_models,json=loadingModels,proto3" json:"loading_models,omitempty"` + OverCommitPercentage uint32 `protobuf:"varint,11,opt,name=over_commit_percentage,json=overCommitPercentage,proto3" json:"over_commit_percentage,omitempty"` + ReservedMemory uint64 `protobuf:"varint,12,opt,name=reserved_memory,json=reservedMemory,proto3" json:"reserved_memory,omitempty"` + UniqueLoadedModels map[string]bool `protobuf:"bytes,13,rep,name=unique_loaded_models,json=uniqueLoadedModels,proto3" json:"unique_loaded_models,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"varint,2,opt,name=value,proto3"` + IsDraining bool `protobuf:"varint,14,opt,name=is_draining,json=isDraining,proto3" json:"is_draining,omitempty"` } func (x *ServerReplica) Reset() { @@ -873,14 +880,14 @@ func (x *ServerReplica) GetAvailableMemory() uint64 { return 0 } -func (x *ServerReplica) GetLoadedModels() []*ModelLoadStatus { +func (x *ServerReplica) GetLoadedModels() []*ModelVersionID { if x != nil { return x.LoadedModels } return nil } -func (x *ServerReplica) GetLoadingModels() []*ModelLoadStatus { +func (x *ServerReplica) GetLoadingModels() []*ModelVersionID { if x != nil { return x.LoadingModels } @@ -925,7 +932,25 @@ var file_mlops_scheduler_db_db_proto_rawDesc = []byte{ 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2f, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x1a, 0x1f, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2f, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2f, 0x73, 0x63, 0x68, 0x65, 0x64, - 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0xa5, 0x01, 0x0a, 0x0d, 0x52, + 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0xa9, 0x01, 0x0a, 0x10, 0x50, + 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x53, 0x6e, 0x61, 0x70, 0x73, 0x68, 0x6f, 0x74, 0x12, + 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, + 0x61, 0x6d, 0x65, 0x12, 0x20, 0x0a, 0x0b, 0x6c, 0x61, 0x73, 0x74, 0x56, 0x65, 0x72, 0x73, 0x69, + 0x6f, 0x6e, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0b, 0x6c, 0x61, 0x73, 0x74, 0x56, 0x65, + 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x12, 0x45, 0x0a, 0x08, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, + 0x73, 0x18, 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x29, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, + 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, + 0x2e, 0x50, 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x57, 0x69, 0x74, 0x68, 0x53, 0x74, 0x61, + 0x74, 0x65, 0x52, 0x08, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x18, 0x0a, 0x07, + 0x64, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x64, + 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x22, 0x72, 0x0a, 0x12, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, + 0x6d, 0x65, 0x6e, 0x74, 0x53, 0x6e, 0x61, 0x70, 0x73, 0x68, 0x6f, 0x74, 0x12, 0x42, 0x0a, 0x0a, + 0x65, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, + 0x32, 0x22, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, + 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, + 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x0a, 0x65, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, + 0x12, 0x18, 0x0a, 0x07, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, + 0x08, 0x52, 0x07, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x22, 0xa5, 0x01, 0x0a, 0x0d, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x42, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0e, 0x32, 0x2c, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, @@ -962,107 +987,98 @@ var file_mlops_scheduler_db_db_proto_rawDesc = []byte{ 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x18, 0x08, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1a, 0x2e, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, 0x2e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x52, 0x09, 0x74, 0x69, 0x6d, 0x65, - 0x73, 0x74, 0x61, 0x6d, 0x70, 0x22, 0x7b, 0x0a, 0x12, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, - 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x23, 0x0a, 0x0d, 0x72, - 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x5f, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x18, 0x01, 0x20, 0x01, - 0x28, 0x05, 0x52, 0x0c, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x49, 0x6e, 0x64, 0x65, 0x78, - 0x12, 0x40, 0x0a, 0x06, 0x73, 0x74, 0x61, 0x74, 0x75, 0x73, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, - 0x32, 0x28, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, - 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x52, 0x65, 0x70, - 0x6c, 0x69, 0x63, 0x61, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x06, 0x73, 0x74, 0x61, 0x74, - 0x75, 0x73, 0x22, 0x87, 0x02, 0x0a, 0x0c, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x56, 0x65, 0x72, 0x73, - 0x69, 0x6f, 0x6e, 0x12, 0x3c, 0x0a, 0x0a, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x5f, 0x64, 0x65, 0x66, - 0x6e, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1d, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, - 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, - 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x52, 0x09, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x44, 0x65, 0x66, - 0x6e, 0x12, 0x18, 0x0a, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x02, 0x20, 0x01, - 0x28, 0x0d, 0x52, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x12, 0x16, 0x0a, 0x06, 0x73, - 0x65, 0x72, 0x76, 0x65, 0x72, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, 0x73, 0x65, 0x72, - 0x76, 0x65, 0x72, 0x12, 0x49, 0x0a, 0x08, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x18, - 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2d, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, - 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, - 0x62, 0x2e, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x45, - 0x6e, 0x74, 0x72, 0x79, 0x52, 0x08, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x3c, - 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x26, 0x2e, + 0x73, 0x74, 0x61, 0x6d, 0x70, 0x22, 0xf6, 0x02, 0x0a, 0x0c, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x56, + 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x12, 0x3c, 0x0a, 0x0a, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x5f, + 0x64, 0x65, 0x66, 0x6e, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x1d, 0x2e, 0x73, 0x65, 0x6c, + 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, + 0x6c, 0x65, 0x72, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x52, 0x09, 0x6d, 0x6f, 0x64, 0x65, 0x6c, + 0x44, 0x65, 0x66, 0x6e, 0x12, 0x18, 0x0a, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, + 0x02, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x12, 0x16, + 0x0a, 0x06, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x06, + 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x12, 0x51, 0x0a, 0x08, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, + 0x61, 0x73, 0x18, 0x04, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x35, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, + 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, + 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, + 0x6e, 0x2e, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, + 0x08, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x3c, 0x0a, 0x05, 0x73, 0x74, 0x61, + 0x74, 0x65, 0x18, 0x05, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x26, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, + 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, + 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, + 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x1a, 0x65, 0x0a, 0x0d, 0x52, 0x65, 0x70, 0x6c, 0x69, + 0x63, 0x61, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, + 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x3e, 0x0a, 0x05, 0x76, 0x61, + 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x28, 0x2e, 0x73, 0x65, 0x6c, 0x64, + 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, + 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x53, 0x74, 0x61, + 0x74, 0x75, 0x73, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x3e, + 0x0a, 0x0e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x49, 0x44, + 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, + 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, + 0x02, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x22, 0x7a, + 0x0a, 0x05, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, + 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x43, 0x0a, 0x08, 0x76, + 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, - 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x53, - 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x0a, 0x0e, - 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x12, 0x12, - 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, - 0x6d, 0x65, 0x12, 0x18, 0x0a, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x02, 0x20, - 0x01, 0x28, 0x0d, 0x52, 0x07, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x22, 0x7a, 0x0a, 0x05, - 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x43, 0x0a, 0x08, 0x76, 0x65, 0x72, - 0x73, 0x69, 0x6f, 0x6e, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x27, 0x2e, 0x73, 0x65, + 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x56, + 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x52, 0x08, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x73, + 0x12, 0x18, 0x0a, 0x07, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, + 0x08, 0x52, 0x07, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x22, 0xac, 0x03, 0x0a, 0x06, 0x53, + 0x65, 0x72, 0x76, 0x65, 0x72, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, + 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x4b, 0x0a, 0x08, 0x72, 0x65, 0x70, + 0x6c, 0x69, 0x63, 0x61, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2f, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, - 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x56, 0x65, 0x72, - 0x73, 0x69, 0x6f, 0x6e, 0x52, 0x08, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x18, - 0x0a, 0x07, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x52, - 0x07, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x22, 0xac, 0x03, 0x0a, 0x06, 0x53, 0x65, 0x72, - 0x76, 0x65, 0x72, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, - 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x12, 0x4b, 0x0a, 0x08, 0x72, 0x65, 0x70, 0x6c, 0x69, - 0x63, 0x61, 0x73, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2f, 0x2e, 0x73, 0x65, 0x6c, 0x64, - 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, - 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x52, 0x65, 0x70, - 0x6c, 0x69, 0x63, 0x61, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x08, 0x72, 0x65, 0x70, 0x6c, - 0x69, 0x63, 0x61, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x18, 0x03, - 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x12, 0x2b, 0x0a, 0x11, - 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, - 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x10, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, - 0x64, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x21, 0x0a, 0x0c, 0x6d, 0x69, 0x6e, - 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, 0x05, 0x52, - 0x0b, 0x6d, 0x69, 0x6e, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x21, 0x0a, 0x0c, - 0x6d, 0x61, 0x78, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x18, 0x06, 0x20, 0x01, - 0x28, 0x05, 0x52, 0x0b, 0x6d, 0x61, 0x78, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, - 0x4f, 0x0a, 0x0f, 0x6b, 0x75, 0x62, 0x65, 0x72, 0x6e, 0x65, 0x74, 0x65, 0x73, 0x5f, 0x6d, 0x65, - 0x74, 0x61, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x26, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, - 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, - 0x72, 0x2e, 0x4b, 0x75, 0x62, 0x65, 0x72, 0x6e, 0x65, 0x74, 0x65, 0x73, 0x4d, 0x65, 0x74, 0x61, - 0x52, 0x0e, 0x6b, 0x75, 0x62, 0x65, 0x72, 0x6e, 0x65, 0x74, 0x65, 0x73, 0x4d, 0x65, 0x74, 0x61, - 0x1a, 0x65, 0x0a, 0x0d, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x45, 0x6e, 0x74, 0x72, - 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, 0x52, 0x03, - 0x6b, 0x65, 0x79, 0x12, 0x3e, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, - 0x28, 0x0b, 0x32, 0x28, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, - 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x53, - 0x65, 0x72, 0x76, 0x65, 0x72, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x52, 0x05, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x7e, 0x0a, 0x0f, 0x4d, 0x6f, 0x64, 0x65, 0x6c, - 0x4c, 0x6f, 0x61, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x12, 0x53, 0x0a, 0x10, 0x6d, 0x6f, - 0x64, 0x65, 0x6c, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x5f, 0x69, 0x64, 0x18, 0x01, - 0x20, 0x01, 0x28, 0x0b, 0x32, 0x29, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, + 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x52, + 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x08, 0x72, 0x65, + 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, + 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x12, 0x2b, + 0x0a, 0x11, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, + 0x63, 0x61, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x10, 0x65, 0x78, 0x70, 0x65, 0x63, + 0x74, 0x65, 0x64, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x21, 0x0a, 0x0c, 0x6d, + 0x69, 0x6e, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, + 0x05, 0x52, 0x0b, 0x6d, 0x69, 0x6e, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x21, + 0x0a, 0x0c, 0x6d, 0x61, 0x78, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x18, 0x06, + 0x20, 0x01, 0x28, 0x05, 0x52, 0x0b, 0x6d, 0x61, 0x78, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, + 0x73, 0x12, 0x4f, 0x0a, 0x0f, 0x6b, 0x75, 0x62, 0x65, 0x72, 0x6e, 0x65, 0x74, 0x65, 0x73, 0x5f, + 0x6d, 0x65, 0x74, 0x61, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x26, 0x2e, 0x73, 0x65, 0x6c, + 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, + 0x6c, 0x65, 0x72, 0x2e, 0x4b, 0x75, 0x62, 0x65, 0x72, 0x6e, 0x65, 0x74, 0x65, 0x73, 0x4d, 0x65, + 0x74, 0x61, 0x52, 0x0e, 0x6b, 0x75, 0x62, 0x65, 0x72, 0x6e, 0x65, 0x74, 0x65, 0x73, 0x4d, 0x65, + 0x74, 0x61, 0x1a, 0x65, 0x0a, 0x0d, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x45, 0x6e, + 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x05, + 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x3e, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, + 0x20, 0x01, 0x28, 0x0b, 0x32, 0x28, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, - 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x52, - 0x0e, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x49, 0x64, 0x12, - 0x16, 0x0a, 0x06, 0x6c, 0x6f, 0x61, 0x64, 0x65, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, - 0x06, 0x6c, 0x6f, 0x61, 0x64, 0x65, 0x64, 0x22, 0x9c, 0x06, 0x0a, 0x0d, 0x53, 0x65, 0x72, 0x76, - 0x65, 0x72, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x12, 0x23, 0x0a, 0x0d, 0x69, 0x6e, 0x66, - 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, 0x76, 0x63, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, - 0x52, 0x0c, 0x69, 0x6e, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x53, 0x76, 0x63, 0x12, 0x2e, - 0x0a, 0x13, 0x69, 0x6e, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x68, 0x74, 0x74, 0x70, - 0x5f, 0x70, 0x6f, 0x72, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x11, 0x69, 0x6e, 0x66, - 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x48, 0x74, 0x74, 0x70, 0x50, 0x6f, 0x72, 0x74, 0x12, 0x2e, - 0x0a, 0x13, 0x69, 0x6e, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x67, 0x72, 0x70, 0x63, - 0x5f, 0x70, 0x6f, 0x72, 0x74, 0x18, 0x03, 0x20, 0x01, 0x28, 0x05, 0x52, 0x11, 0x69, 0x6e, 0x66, - 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x47, 0x72, 0x70, 0x63, 0x50, 0x6f, 0x72, 0x74, 0x12, 0x1f, - 0x0a, 0x0b, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x04, 0x20, - 0x01, 0x28, 0x09, 0x52, 0x0a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x4e, 0x61, 0x6d, 0x65, 0x12, - 0x1f, 0x0a, 0x0b, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x5f, 0x69, 0x64, 0x78, 0x18, 0x05, - 0x20, 0x01, 0x28, 0x05, 0x52, 0x0a, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x49, 0x64, 0x78, - 0x12, 0x22, 0x0a, 0x0c, 0x63, 0x61, 0x70, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, - 0x18, 0x06, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0c, 0x63, 0x61, 0x70, 0x61, 0x62, 0x69, 0x6c, 0x69, - 0x74, 0x69, 0x65, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x18, 0x07, - 0x20, 0x01, 0x28, 0x04, 0x52, 0x06, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x12, 0x29, 0x0a, 0x10, - 0x61, 0x76, 0x61, 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, - 0x18, 0x08, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0f, 0x61, 0x76, 0x61, 0x69, 0x6c, 0x61, 0x62, 0x6c, - 0x65, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x12, 0x4f, 0x0a, 0x0d, 0x6c, 0x6f, 0x61, 0x64, 0x65, - 0x64, 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x73, 0x18, 0x09, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2a, - 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, - 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, - 0x4c, 0x6f, 0x61, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x0c, 0x6c, 0x6f, 0x61, 0x64, - 0x65, 0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x73, 0x12, 0x51, 0x0a, 0x0e, 0x6c, 0x6f, 0x61, 0x64, - 0x69, 0x6e, 0x67, 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x73, 0x18, 0x0a, 0x20, 0x03, 0x28, 0x0b, - 0x32, 0x2a, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, + 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, 0x72, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x52, 0x05, + 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x9a, 0x06, 0x0a, 0x0d, 0x53, 0x65, + 0x72, 0x76, 0x65, 0x72, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x12, 0x23, 0x0a, 0x0d, 0x69, + 0x6e, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x73, 0x76, 0x63, 0x18, 0x01, 0x20, 0x01, + 0x28, 0x09, 0x52, 0x0c, 0x69, 0x6e, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x53, 0x76, 0x63, + 0x12, 0x2e, 0x0a, 0x13, 0x69, 0x6e, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x68, 0x74, + 0x74, 0x70, 0x5f, 0x70, 0x6f, 0x72, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x05, 0x52, 0x11, 0x69, + 0x6e, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x48, 0x74, 0x74, 0x70, 0x50, 0x6f, 0x72, 0x74, + 0x12, 0x2e, 0x0a, 0x13, 0x69, 0x6e, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x67, 0x72, + 0x70, 0x63, 0x5f, 0x70, 0x6f, 0x72, 0x74, 0x18, 0x03, 0x20, 0x01, 0x28, 0x05, 0x52, 0x11, 0x69, + 0x6e, 0x66, 0x65, 0x72, 0x65, 0x6e, 0x63, 0x65, 0x47, 0x72, 0x70, 0x63, 0x50, 0x6f, 0x72, 0x74, + 0x12, 0x1f, 0x0a, 0x0b, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x18, + 0x04, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x4e, 0x61, 0x6d, + 0x65, 0x12, 0x1f, 0x0a, 0x0b, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x5f, 0x69, 0x64, 0x78, + 0x18, 0x05, 0x20, 0x01, 0x28, 0x05, 0x52, 0x0a, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x49, + 0x64, 0x78, 0x12, 0x22, 0x0a, 0x0c, 0x63, 0x61, 0x70, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, + 0x65, 0x73, 0x18, 0x06, 0x20, 0x03, 0x28, 0x09, 0x52, 0x0c, 0x63, 0x61, 0x70, 0x61, 0x62, 0x69, + 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, + 0x18, 0x07, 0x20, 0x01, 0x28, 0x04, 0x52, 0x06, 0x6d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x12, 0x29, + 0x0a, 0x10, 0x61, 0x76, 0x61, 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x5f, 0x6d, 0x65, 0x6d, 0x6f, + 0x72, 0x79, 0x18, 0x08, 0x20, 0x01, 0x28, 0x04, 0x52, 0x0f, 0x61, 0x76, 0x61, 0x69, 0x6c, 0x61, + 0x62, 0x6c, 0x65, 0x4d, 0x65, 0x6d, 0x6f, 0x72, 0x79, 0x12, 0x4e, 0x0a, 0x0d, 0x6c, 0x6f, 0x61, + 0x64, 0x65, 0x64, 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x73, 0x18, 0x09, 0x20, 0x03, 0x28, 0x0b, + 0x32, 0x29, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x4d, 0x6f, 0x64, - 0x65, 0x6c, 0x4c, 0x6f, 0x61, 0x64, 0x53, 0x74, 0x61, 0x74, 0x75, 0x73, 0x52, 0x0d, 0x6c, 0x6f, + 0x65, 0x6c, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x52, 0x0c, 0x6c, 0x6f, 0x61, + 0x64, 0x65, 0x64, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x73, 0x12, 0x50, 0x0a, 0x0e, 0x6c, 0x6f, 0x61, + 0x64, 0x69, 0x6e, 0x67, 0x5f, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x73, 0x18, 0x0a, 0x20, 0x03, 0x28, + 0x0b, 0x32, 0x29, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, + 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x4d, 0x6f, + 0x64, 0x65, 0x6c, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x49, 0x44, 0x52, 0x0d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x73, 0x12, 0x34, 0x0a, 0x16, 0x6f, 0x76, 0x65, 0x72, 0x5f, 0x63, 0x6f, 0x6d, 0x6d, 0x69, 0x74, 0x5f, 0x70, 0x65, 0x72, 0x63, 0x65, 0x6e, 0x74, 0x61, 0x67, 0x65, 0x18, 0x0b, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x14, 0x6f, 0x76, 0x65, @@ -1155,48 +1171,52 @@ func file_mlops_scheduler_db_db_proto_rawDescGZIP() []byte { } var file_mlops_scheduler_db_db_proto_enumTypes = make([]protoimpl.EnumInfo, 2) -var file_mlops_scheduler_db_db_proto_msgTypes = make([]protoimpl.MessageInfo, 11) +var file_mlops_scheduler_db_db_proto_msgTypes = make([]protoimpl.MessageInfo, 12) var file_mlops_scheduler_db_db_proto_goTypes = []any{ - (ModelState)(0), // 0: seldon.mlops.scheduler.db.ModelState - (ModelReplicaState)(0), // 1: seldon.mlops.scheduler.db.ModelReplicaState - (*ReplicaStatus)(nil), // 2: seldon.mlops.scheduler.db.ReplicaStatus - (*ModelStatus)(nil), // 3: seldon.mlops.scheduler.db.ModelStatus - (*ReplicaStatusEntry)(nil), // 4: seldon.mlops.scheduler.db.ReplicaStatusEntry - (*ModelVersion)(nil), // 5: seldon.mlops.scheduler.db.ModelVersion - (*ModelVersionID)(nil), // 6: seldon.mlops.scheduler.db.ModelVersionID - (*Model)(nil), // 7: seldon.mlops.scheduler.db.Model - (*Server)(nil), // 8: seldon.mlops.scheduler.db.Server - (*ModelLoadStatus)(nil), // 9: seldon.mlops.scheduler.db.ModelLoadStatus - (*ServerReplica)(nil), // 10: seldon.mlops.scheduler.db.ServerReplica - nil, // 11: seldon.mlops.scheduler.db.Server.ReplicasEntry - nil, // 12: seldon.mlops.scheduler.db.ServerReplica.UniqueLoadedModelsEntry - (*timestamppb.Timestamp)(nil), // 13: google.protobuf.Timestamp - (*scheduler.Model)(nil), // 14: seldon.mlops.scheduler.Model - (*scheduler.KubernetesMeta)(nil), // 15: seldon.mlops.scheduler.KubernetesMeta + (ModelState)(0), // 0: seldon.mlops.scheduler.db.ModelState + (ModelReplicaState)(0), // 1: seldon.mlops.scheduler.db.ModelReplicaState + (*PipelineSnapshot)(nil), // 2: seldon.mlops.scheduler.db.PipelineSnapshot + (*ExperimentSnapshot)(nil), // 3: seldon.mlops.scheduler.db.ExperimentSnapshot + (*ReplicaStatus)(nil), // 4: seldon.mlops.scheduler.db.ReplicaStatus + (*ModelStatus)(nil), // 5: seldon.mlops.scheduler.db.ModelStatus + (*ModelVersion)(nil), // 6: seldon.mlops.scheduler.db.ModelVersion + (*ModelVersionID)(nil), // 7: seldon.mlops.scheduler.db.ModelVersionID + (*Model)(nil), // 8: seldon.mlops.scheduler.db.Model + (*Server)(nil), // 9: seldon.mlops.scheduler.db.Server + (*ServerReplica)(nil), // 10: seldon.mlops.scheduler.db.ServerReplica + nil, // 11: seldon.mlops.scheduler.db.ModelVersion.ReplicasEntry + nil, // 12: seldon.mlops.scheduler.db.Server.ReplicasEntry + nil, // 13: seldon.mlops.scheduler.db.ServerReplica.UniqueLoadedModelsEntry + (*scheduler.PipelineWithState)(nil), // 14: seldon.mlops.scheduler.PipelineWithState + (*scheduler.Experiment)(nil), // 15: seldon.mlops.scheduler.Experiment + (*timestamppb.Timestamp)(nil), // 16: google.protobuf.Timestamp + (*scheduler.Model)(nil), // 17: seldon.mlops.scheduler.Model + (*scheduler.KubernetesMeta)(nil), // 18: seldon.mlops.scheduler.KubernetesMeta } var file_mlops_scheduler_db_db_proto_depIdxs = []int32{ - 1, // 0: seldon.mlops.scheduler.db.ReplicaStatus.state:type_name -> seldon.mlops.scheduler.db.ModelReplicaState - 13, // 1: seldon.mlops.scheduler.db.ReplicaStatus.timestamp:type_name -> google.protobuf.Timestamp - 0, // 2: seldon.mlops.scheduler.db.ModelStatus.state:type_name -> seldon.mlops.scheduler.db.ModelState - 0, // 3: seldon.mlops.scheduler.db.ModelStatus.model_gw_state:type_name -> seldon.mlops.scheduler.db.ModelState - 13, // 4: seldon.mlops.scheduler.db.ModelStatus.timestamp:type_name -> google.protobuf.Timestamp - 2, // 5: seldon.mlops.scheduler.db.ReplicaStatusEntry.status:type_name -> seldon.mlops.scheduler.db.ReplicaStatus - 14, // 6: seldon.mlops.scheduler.db.ModelVersion.model_defn:type_name -> seldon.mlops.scheduler.Model - 4, // 7: seldon.mlops.scheduler.db.ModelVersion.replicas:type_name -> seldon.mlops.scheduler.db.ReplicaStatusEntry - 3, // 8: seldon.mlops.scheduler.db.ModelVersion.state:type_name -> seldon.mlops.scheduler.db.ModelStatus - 5, // 9: seldon.mlops.scheduler.db.Model.versions:type_name -> seldon.mlops.scheduler.db.ModelVersion - 11, // 10: seldon.mlops.scheduler.db.Server.replicas:type_name -> seldon.mlops.scheduler.db.Server.ReplicasEntry - 15, // 11: seldon.mlops.scheduler.db.Server.kubernetes_meta:type_name -> seldon.mlops.scheduler.KubernetesMeta - 6, // 12: seldon.mlops.scheduler.db.ModelLoadStatus.model_version_id:type_name -> seldon.mlops.scheduler.db.ModelVersionID - 9, // 13: seldon.mlops.scheduler.db.ServerReplica.loaded_models:type_name -> seldon.mlops.scheduler.db.ModelLoadStatus - 9, // 14: seldon.mlops.scheduler.db.ServerReplica.loading_models:type_name -> seldon.mlops.scheduler.db.ModelLoadStatus - 12, // 15: seldon.mlops.scheduler.db.ServerReplica.unique_loaded_models:type_name -> seldon.mlops.scheduler.db.ServerReplica.UniqueLoadedModelsEntry - 10, // 16: seldon.mlops.scheduler.db.Server.ReplicasEntry.value:type_name -> seldon.mlops.scheduler.db.ServerReplica - 17, // [17:17] is the sub-list for method output_type - 17, // [17:17] is the sub-list for method input_type - 17, // [17:17] is the sub-list for extension type_name - 17, // [17:17] is the sub-list for extension extendee - 0, // [0:17] is the sub-list for field type_name + 14, // 0: seldon.mlops.scheduler.db.PipelineSnapshot.versions:type_name -> seldon.mlops.scheduler.PipelineWithState + 15, // 1: seldon.mlops.scheduler.db.ExperimentSnapshot.experiment:type_name -> seldon.mlops.scheduler.Experiment + 1, // 2: seldon.mlops.scheduler.db.ReplicaStatus.state:type_name -> seldon.mlops.scheduler.db.ModelReplicaState + 16, // 3: seldon.mlops.scheduler.db.ReplicaStatus.timestamp:type_name -> google.protobuf.Timestamp + 0, // 4: seldon.mlops.scheduler.db.ModelStatus.state:type_name -> seldon.mlops.scheduler.db.ModelState + 0, // 5: seldon.mlops.scheduler.db.ModelStatus.model_gw_state:type_name -> seldon.mlops.scheduler.db.ModelState + 16, // 6: seldon.mlops.scheduler.db.ModelStatus.timestamp:type_name -> google.protobuf.Timestamp + 17, // 7: seldon.mlops.scheduler.db.ModelVersion.model_defn:type_name -> seldon.mlops.scheduler.Model + 11, // 8: seldon.mlops.scheduler.db.ModelVersion.replicas:type_name -> seldon.mlops.scheduler.db.ModelVersion.ReplicasEntry + 5, // 9: seldon.mlops.scheduler.db.ModelVersion.state:type_name -> seldon.mlops.scheduler.db.ModelStatus + 6, // 10: seldon.mlops.scheduler.db.Model.versions:type_name -> seldon.mlops.scheduler.db.ModelVersion + 12, // 11: seldon.mlops.scheduler.db.Server.replicas:type_name -> seldon.mlops.scheduler.db.Server.ReplicasEntry + 18, // 12: seldon.mlops.scheduler.db.Server.kubernetes_meta:type_name -> seldon.mlops.scheduler.KubernetesMeta + 7, // 13: seldon.mlops.scheduler.db.ServerReplica.loaded_models:type_name -> seldon.mlops.scheduler.db.ModelVersionID + 7, // 14: seldon.mlops.scheduler.db.ServerReplica.loading_models:type_name -> seldon.mlops.scheduler.db.ModelVersionID + 13, // 15: seldon.mlops.scheduler.db.ServerReplica.unique_loaded_models:type_name -> seldon.mlops.scheduler.db.ServerReplica.UniqueLoadedModelsEntry + 4, // 16: seldon.mlops.scheduler.db.ModelVersion.ReplicasEntry.value:type_name -> seldon.mlops.scheduler.db.ReplicaStatus + 10, // 17: seldon.mlops.scheduler.db.Server.ReplicasEntry.value:type_name -> seldon.mlops.scheduler.db.ServerReplica + 18, // [18:18] is the sub-list for method output_type + 18, // [18:18] is the sub-list for method input_type + 18, // [18:18] is the sub-list for extension type_name + 18, // [18:18] is the sub-list for extension extendee + 0, // [0:18] is the sub-list for field type_name } func init() { file_mlops_scheduler_db_db_proto_init() } @@ -1206,7 +1226,7 @@ func file_mlops_scheduler_db_db_proto_init() { } if !protoimpl.UnsafeEnabled { file_mlops_scheduler_db_db_proto_msgTypes[0].Exporter = func(v any, i int) any { - switch v := v.(*ReplicaStatus); i { + switch v := v.(*PipelineSnapshot); i { case 0: return &v.state case 1: @@ -1218,7 +1238,7 @@ func file_mlops_scheduler_db_db_proto_init() { } } file_mlops_scheduler_db_db_proto_msgTypes[1].Exporter = func(v any, i int) any { - switch v := v.(*ModelStatus); i { + switch v := v.(*ExperimentSnapshot); i { case 0: return &v.state case 1: @@ -1230,7 +1250,7 @@ func file_mlops_scheduler_db_db_proto_init() { } } file_mlops_scheduler_db_db_proto_msgTypes[2].Exporter = func(v any, i int) any { - switch v := v.(*ReplicaStatusEntry); i { + switch v := v.(*ReplicaStatus); i { case 0: return &v.state case 1: @@ -1242,7 +1262,7 @@ func file_mlops_scheduler_db_db_proto_init() { } } file_mlops_scheduler_db_db_proto_msgTypes[3].Exporter = func(v any, i int) any { - switch v := v.(*ModelVersion); i { + switch v := v.(*ModelStatus); i { case 0: return &v.state case 1: @@ -1254,7 +1274,7 @@ func file_mlops_scheduler_db_db_proto_init() { } } file_mlops_scheduler_db_db_proto_msgTypes[4].Exporter = func(v any, i int) any { - switch v := v.(*ModelVersionID); i { + switch v := v.(*ModelVersion); i { case 0: return &v.state case 1: @@ -1266,7 +1286,7 @@ func file_mlops_scheduler_db_db_proto_init() { } } file_mlops_scheduler_db_db_proto_msgTypes[5].Exporter = func(v any, i int) any { - switch v := v.(*Model); i { + switch v := v.(*ModelVersionID); i { case 0: return &v.state case 1: @@ -1278,7 +1298,7 @@ func file_mlops_scheduler_db_db_proto_init() { } } file_mlops_scheduler_db_db_proto_msgTypes[6].Exporter = func(v any, i int) any { - switch v := v.(*Server); i { + switch v := v.(*Model); i { case 0: return &v.state case 1: @@ -1290,7 +1310,7 @@ func file_mlops_scheduler_db_db_proto_init() { } } file_mlops_scheduler_db_db_proto_msgTypes[7].Exporter = func(v any, i int) any { - switch v := v.(*ModelLoadStatus); i { + switch v := v.(*Server); i { case 0: return &v.state case 1: @@ -1320,7 +1340,7 @@ func file_mlops_scheduler_db_db_proto_init() { GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_mlops_scheduler_db_db_proto_rawDesc, NumEnums: 2, - NumMessages: 11, + NumMessages: 12, NumExtensions: 0, NumServices: 0, }, diff --git a/apis/go/mlops/scheduler/db/db_ext.go b/apis/go/mlops/scheduler/db/db_ext.go deleted file mode 100644 index d4cdbe9425..0000000000 --- a/apis/go/mlops/scheduler/db/db_ext.go +++ /dev/null @@ -1,121 +0,0 @@ -package db - -import ( - "time" - - "google.golang.org/protobuf/types/known/timestamppb" -) - -func (m *ModelVersion) GetAssignment() []int { - var assignment []int - var draining []int - - for k, v := range m.Replicas { - if v.Status.State == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || - v.Status.State == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || - v.Status.State == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE { - assignment = append(assignment, k) - } - if v.Status.State == ModelReplicaState_MODEL_REPLICA_STATE_DRAINING { - draining = append(draining, k) - } - } - - // prefer assignments that are not draining as envoy is eventual consistent - if len(assignment) > 0 { - return assignment - } - if len(draining) > 0 { - return draining - } - return nil -} - -func (m *ModelVersion) SetReplicaState(replicaIdx int, state ModelReplicaState, reason string) { - m.Replicas[replicaIdx] = &ReplicaStatusEntry{State: state, Timestamp: timestamppb.Now(), Reason: reason} -} - -func (m *Model) Latest() *ModelVersion { - if len(m.Versions) > 0 { - return m.Versions[len(m.Versions)-1] - } - return nil -} - -func (m *Model) HasLatest() bool { - return len(m.Versions) > 0 -} - -func (m *Model) GetVersion(version uint32) *ModelVersion { - for _, mv := range m.Versions { - if mv.GetVersion() == version { - return mv - } - } - return nil -} - -// TODO do we need to consider previous versions? -func (m *Model) Inactive() bool { - return m.Latest().Inactive() -} - -func (m *ModelVersion) Inactive() bool { - for _, v := range m.Replicas { - if !v.Status.State.Inactive() { - return false - } - } - return true -} - -func (m ModelReplicaState) CanReceiveTraffic() bool { - return m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || - m == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || - m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE || - m == ModelReplicaState_MODEL_REPLICA_STATE_DRAINING -} - -func (m ModelReplicaState) AlreadyLoadingOrLoaded() bool { - return m == ModelReplicaState_MODEL_REPLICA_STATE_LOADING || - m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || - m == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || - m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE -} - -func (m ModelReplicaState) UnloadingOrUnloaded() bool { - return m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED || - m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED || - m == ModelReplicaState_MODEL_REPLICA_STATE_LOADING || - m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED || - m == ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN -} - -func (m ModelReplicaState) Inactive() bool { - return m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED || - m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_FAILED || - m == ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN || - m == ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED -} - -func (m ModelReplicaState) IsLoadingOrLoaded() bool { - return m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || - m == ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED || - m == ModelReplicaState_MODEL_REPLICA_STATE_LOADING || - m == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE -} - -// -// Servers -// - -func (s *ServerReplica) GetLoadedOrLoadingModelVersions() []*ModelVersionID { - var models []*ModelVersionID - for _, model := range s.LoadingModels { - models = append(models, model.ModelVersionId) - } - for _, model := range s.LoadingModels { - models = append(models, model.ModelVersionId) - } - return models -} diff --git a/apis/go/mlops/scheduler/db/model_ext.go b/apis/go/mlops/scheduler/db/model_ext.go new file mode 100644 index 0000000000..7baae1b47d --- /dev/null +++ b/apis/go/mlops/scheduler/db/model_ext.go @@ -0,0 +1,181 @@ +package db + +import ( + pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "google.golang.org/protobuf/types/known/timestamppb" +) + +func (m *ModelVersion) GetAssignment() []int { + var assignment []int + var draining []int + + for k, v := range m.Replicas { + if v.State == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || + v.State == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || + v.State == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE { + assignment = append(assignment, int(k)) + } + if v.State == ModelReplicaState_MODEL_REPLICA_STATE_DRAINING { + draining = append(draining, int(k)) + } + } + + // prefer assignments that are not draining as envoy is eventual consistent + if len(assignment) > 0 { + return assignment + } + if len(draining) > 0 { + return draining + } + return nil +} + +func (m *ModelVersion) DesiredReplicas() int { + return int(m.ModelDefn.DeploymentSpec.Replicas) +} + +func (m *ModelVersion) GetReplicaForState(state ModelReplicaState) []int { + var assignment []int + for k, v := range m.Replicas { + if v.State == state { + assignment = append(assignment, int(k)) + } + } + return assignment +} + +func (m *ModelVersion) HasServer() bool { + return m.Server != "" +} + +func (m *ModelVersion) GetRequiredMemory() uint64 { + var multiplier uint64 = 1 + if m.ModelDefn != nil && m.ModelDefn.ModelSpec != nil && + m.ModelDefn.ModelSpec.ModelRuntimeInfo != nil && + m.ModelDefn.ModelSpec.ModelRuntimeInfo.ModelRuntimeInfo != nil { + multiplier = getInstanceCount(m.ModelDefn.GetModelSpec().ModelRuntimeInfo) + } + return m.ModelDefn.GetModelSpec().GetMemoryBytes() * multiplier +} + +func getInstanceCount(modelRuntimeInfo *pb.ModelRuntimeInfo) uint64 { + switch modelRuntimeInfo.ModelRuntimeInfo.(type) { + case *pb.ModelRuntimeInfo_Mlserver: + return uint64(modelRuntimeInfo.GetMlserver().ParallelWorkers) + case *pb.ModelRuntimeInfo_Triton: + return uint64(modelRuntimeInfo.GetTriton().Cpu[0].InstanceCount) + default: + return 1 + } +} + +func (m *ModelVersion) SetReplicaState(replicaIdx int, state ModelReplicaState, reason string) { + m.Replicas[int32(replicaIdx)] = &ReplicaStatus{State: state, Timestamp: timestamppb.Now(), Reason: reason} +} + +func (m *ModelVersion) UpdateRuntimeInfo(runtimeInfo *pb.ModelRuntimeInfo) { + if m.ModelDefn.ModelSpec != nil && m.ModelDefn.ModelSpec.ModelRuntimeInfo == nil && runtimeInfo != nil { + m.ModelDefn.ModelSpec.ModelRuntimeInfo = runtimeInfo + } +} + +func (m *ModelVersion) GetModelReplicaState(replicaIdx int) ModelReplicaState { + state, ok := m.Replicas[int32(replicaIdx)] + if !ok { + return ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN + } + return state.State +} + +func (m *Model) Latest() *ModelVersion { + if len(m.Versions) > 0 { + return m.Versions[len(m.Versions)-1] + } + return nil +} + +func (m *Model) HasLatest() bool { + return len(m.Versions) > 0 +} + +func (m *Model) GetVersion(version uint32) *ModelVersion { + for _, mv := range m.Versions { + if mv.GetVersion() == version { + return mv + } + } + return nil +} + +// TODO do we need to consider previous versions? +func (m *Model) Inactive() bool { + return m.Latest().Inactive() +} + +func (m *Model) getLastAvailableModelVersionIdx() int { + lastAvailableIdx := -1 + for idx, mv := range m.Versions { + if mv.State.State == ModelState_MODEL_STATE_AVAILABLE { + lastAvailableIdx = idx + } + } + return lastAvailableIdx +} + +func (m *Model) GetLastAvailableModelVersion() *ModelVersion { + lastAvailableIdx := m.getLastAvailableModelVersionIdx() + if lastAvailableIdx != -1 { + return m.Versions[lastAvailableIdx] + } + return nil +} + +func (m *ModelVersion) Inactive() bool { + for _, v := range m.Replicas { + if !v.State.Inactive() { + return false + } + } + return true +} + +func (m *ModelVersion) DeleteReplica(replicaIdx int) { + delete(m.Replicas, int32(replicaIdx)) +} + +func (m ModelReplicaState) CanReceiveTraffic() bool { + return m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || + m == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || + m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE || + m == ModelReplicaState_MODEL_REPLICA_STATE_DRAINING +} + +func (m ModelReplicaState) AlreadyLoadingOrLoaded() bool { + return m == ModelReplicaState_MODEL_REPLICA_STATE_LOADING || + m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || + m == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || + m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE +} + +func (m ModelReplicaState) UnloadingOrUnloaded() bool { + return m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED || + m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED || + m == ModelReplicaState_MODEL_REPLICA_STATE_LOADING || + m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED || + m == ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN +} + +func (m ModelReplicaState) Inactive() bool { + return m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED || + m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_FAILED || + m == ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN || + m == ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED +} + +func (m ModelReplicaState) IsLoadingOrLoaded() bool { + return m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || + m == ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED || + m == ModelReplicaState_MODEL_REPLICA_STATE_LOADING || + m == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || + m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE +} diff --git a/apis/go/mlops/scheduler/db/server_ext.go b/apis/go/mlops/scheduler/db/server_ext.go new file mode 100644 index 0000000000..58910f96f0 --- /dev/null +++ b/apis/go/mlops/scheduler/db/server_ext.go @@ -0,0 +1,83 @@ +package db + +import ( + "slices" +) + +func (s *ServerReplica) GetLoadedOrLoadingModelVersions() []*ModelVersionID { + var models []*ModelVersionID + models = append(models, s.LoadedModels...) + models = append(models, s.LoadingModels...) + return models +} + +func (s *ServerReplica) UpdateReservedMemory(memBytes uint64, isAdd bool) { + if isAdd { + s.ReservedMemory += memBytes + return + } + if memBytes > s.ReservedMemory { + s.ReservedMemory = 0 + return + } + s.ReservedMemory -= memBytes +} + +func (s *ServerReplica) AddModelVersion(modelName string, modelVersion uint32, replicaState ModelReplicaState) { + mvID := &ModelVersionID{ + Name: modelName, + Version: modelVersion, + } + + if replicaState == ModelReplicaState_MODEL_REPLICA_STATE_LOADING { + s.addToList(&s.LoadingModels, mvID) + return + } + + if replicaState == ModelReplicaState_MODEL_REPLICA_STATE_LOADED { + s.removeFromList(&s.LoadingModels, mvID) + s.addToList(&s.LoadedModels, mvID) + s.UniqueLoadedModels[modelName] = true + } +} + +func (s *ServerReplica) DeleteModelVersion(modelName string, modelVersion uint32) { + mvID := &ModelVersionID{ + Name: modelName, + Version: modelVersion, + } + + s.removeFromList(&s.LoadingModels, mvID) + s.removeFromList(&s.LoadedModels, mvID) + + if !s.modelExistsInList(s.LoadedModels, modelName) { + delete(s.UniqueLoadedModels, modelName) + } +} + +func (s *ServerReplica) addToList(list *[]*ModelVersionID, mvID *ModelVersionID) { + for _, model := range *list { + if model.Version == mvID.Version && model.Name == mvID.Name { + return + } + } + *list = append(*list, &ModelVersionID{ + Name: mvID.Name, + Version: mvID.Version, + }) +} + +func (s *ServerReplica) removeFromList(list *[]*ModelVersionID, mvID *ModelVersionID) { + *list = slices.DeleteFunc(*list, func(m *ModelVersionID) bool { + return m.Version == mvID.Version && m.Name == mvID.Name + }) +} + +func (s *ServerReplica) modelExistsInList(list []*ModelVersionID, modelName string) bool { + for _, model := range list { + if model.Name == modelName { + return true + } + } + return false +} diff --git a/apis/go/mlops/scheduler/scheduler.pb.go b/apis/go/mlops/scheduler/scheduler.pb.go index 1461692bc2..5e21a2dbb8 100644 --- a/apis/go/mlops/scheduler/scheduler.pb.go +++ b/apis/go/mlops/scheduler/scheduler.pb.go @@ -1,12 +1,3 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.34.2 diff --git a/apis/go/mlops/scheduler/scheduler_grpc.pb.go b/apis/go/mlops/scheduler/scheduler_grpc.pb.go index 2f58ce8c17..d4e6a770e7 100644 --- a/apis/go/mlops/scheduler/scheduler_grpc.pb.go +++ b/apis/go/mlops/scheduler/scheduler_grpc.pb.go @@ -1,12 +1,3 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.4.0 diff --git a/apis/go/mlops/scheduler/storage.pb.go b/apis/go/mlops/scheduler/storage.pb.go deleted file mode 100644 index a36ddb1861..0000000000 --- a/apis/go/mlops/scheduler/storage.pb.go +++ /dev/null @@ -1,273 +0,0 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - -// Code generated by protoc-gen-go. DO NOT EDIT. -// versions: -// protoc-gen-go v1.34.2 -// protoc v5.27.2 -// source: mlops/scheduler/storage.proto - -package scheduler - -import ( - protoreflect "google.golang.org/protobuf/reflect/protoreflect" - protoimpl "google.golang.org/protobuf/runtime/protoimpl" - reflect "reflect" - sync "sync" -) - -const ( - // Verify that this generated code is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) - // Verify that runtime/protoimpl is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) -) - -type PipelineSnapshot struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` - LastVersion uint32 `protobuf:"varint,2,opt,name=lastVersion,proto3" json:"lastVersion,omitempty"` - Versions []*PipelineWithState `protobuf:"bytes,3,rep,name=versions,proto3" json:"versions,omitempty"` - Deleted bool `protobuf:"varint,4,opt,name=deleted,proto3" json:"deleted,omitempty"` -} - -func (x *PipelineSnapshot) Reset() { - *x = PipelineSnapshot{} - if protoimpl.UnsafeEnabled { - mi := &file_mlops_scheduler_storage_proto_msgTypes[0] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *PipelineSnapshot) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*PipelineSnapshot) ProtoMessage() {} - -func (x *PipelineSnapshot) ProtoReflect() protoreflect.Message { - mi := &file_mlops_scheduler_storage_proto_msgTypes[0] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use PipelineSnapshot.ProtoReflect.Descriptor instead. -func (*PipelineSnapshot) Descriptor() ([]byte, []int) { - return file_mlops_scheduler_storage_proto_rawDescGZIP(), []int{0} -} - -func (x *PipelineSnapshot) GetName() string { - if x != nil { - return x.Name - } - return "" -} - -func (x *PipelineSnapshot) GetLastVersion() uint32 { - if x != nil { - return x.LastVersion - } - return 0 -} - -func (x *PipelineSnapshot) GetVersions() []*PipelineWithState { - if x != nil { - return x.Versions - } - return nil -} - -func (x *PipelineSnapshot) GetDeleted() bool { - if x != nil { - return x.Deleted - } - return false -} - -type ExperimentSnapshot struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Experiment *Experiment `protobuf:"bytes,1,opt,name=experiment,proto3" json:"experiment,omitempty"` - // to mark the experiment as deleted, this is currently required as we persist all - // experiments in the local scheduler state (badgerdb) so that events can be replayed - // on restart, which would guard against lost events in communication. - Deleted bool `protobuf:"varint,2,opt,name=deleted,proto3" json:"deleted,omitempty"` -} - -func (x *ExperimentSnapshot) Reset() { - *x = ExperimentSnapshot{} - if protoimpl.UnsafeEnabled { - mi := &file_mlops_scheduler_storage_proto_msgTypes[1] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *ExperimentSnapshot) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*ExperimentSnapshot) ProtoMessage() {} - -func (x *ExperimentSnapshot) ProtoReflect() protoreflect.Message { - mi := &file_mlops_scheduler_storage_proto_msgTypes[1] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use ExperimentSnapshot.ProtoReflect.Descriptor instead. -func (*ExperimentSnapshot) Descriptor() ([]byte, []int) { - return file_mlops_scheduler_storage_proto_rawDescGZIP(), []int{1} -} - -func (x *ExperimentSnapshot) GetExperiment() *Experiment { - if x != nil { - return x.Experiment - } - return nil -} - -func (x *ExperimentSnapshot) GetDeleted() bool { - if x != nil { - return x.Deleted - } - return false -} - -var File_mlops_scheduler_storage_proto protoreflect.FileDescriptor - -var file_mlops_scheduler_storage_proto_rawDesc = []byte{ - 0x0a, 0x1d, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2f, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, - 0x72, 0x2f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, - 0x16, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, - 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x1a, 0x1f, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2f, 0x73, - 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2f, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, - 0x65, 0x72, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0xa9, 0x01, 0x0a, 0x10, 0x50, 0x69, 0x70, - 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x53, 0x6e, 0x61, 0x70, 0x73, 0x68, 0x6f, 0x74, 0x12, 0x12, 0x0a, - 0x04, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x61, 0x6d, - 0x65, 0x12, 0x20, 0x0a, 0x0b, 0x6c, 0x61, 0x73, 0x74, 0x56, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, - 0x18, 0x02, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0b, 0x6c, 0x61, 0x73, 0x74, 0x56, 0x65, 0x72, 0x73, - 0x69, 0x6f, 0x6e, 0x12, 0x45, 0x0a, 0x08, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x73, 0x18, - 0x03, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x29, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, - 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x50, - 0x69, 0x70, 0x65, 0x6c, 0x69, 0x6e, 0x65, 0x57, 0x69, 0x74, 0x68, 0x53, 0x74, 0x61, 0x74, 0x65, - 0x52, 0x08, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x73, 0x12, 0x18, 0x0a, 0x07, 0x64, 0x65, - 0x6c, 0x65, 0x74, 0x65, 0x64, 0x18, 0x04, 0x20, 0x01, 0x28, 0x08, 0x52, 0x07, 0x64, 0x65, 0x6c, - 0x65, 0x74, 0x65, 0x64, 0x22, 0x72, 0x0a, 0x12, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, - 0x6e, 0x74, 0x53, 0x6e, 0x61, 0x70, 0x73, 0x68, 0x6f, 0x74, 0x12, 0x42, 0x0a, 0x0a, 0x65, 0x78, - 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x18, 0x01, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x22, - 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, - 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x45, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, - 0x6e, 0x74, 0x52, 0x0a, 0x65, 0x78, 0x70, 0x65, 0x72, 0x69, 0x6d, 0x65, 0x6e, 0x74, 0x12, 0x18, - 0x0a, 0x07, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, - 0x07, 0x64, 0x65, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x42, 0x3c, 0x5a, 0x3a, 0x67, 0x69, 0x74, 0x68, - 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x69, 0x6f, 0x2f, - 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2d, 0x63, 0x6f, 0x72, 0x65, 0x2f, 0x61, 0x70, 0x69, 0x73, - 0x2f, 0x67, 0x6f, 0x2f, 0x76, 0x32, 0x2f, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2f, 0x73, 0x63, 0x68, - 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, -} - -var ( - file_mlops_scheduler_storage_proto_rawDescOnce sync.Once - file_mlops_scheduler_storage_proto_rawDescData = file_mlops_scheduler_storage_proto_rawDesc -) - -func file_mlops_scheduler_storage_proto_rawDescGZIP() []byte { - file_mlops_scheduler_storage_proto_rawDescOnce.Do(func() { - file_mlops_scheduler_storage_proto_rawDescData = protoimpl.X.CompressGZIP(file_mlops_scheduler_storage_proto_rawDescData) - }) - return file_mlops_scheduler_storage_proto_rawDescData -} - -var file_mlops_scheduler_storage_proto_msgTypes = make([]protoimpl.MessageInfo, 2) -var file_mlops_scheduler_storage_proto_goTypes = []any{ - (*PipelineSnapshot)(nil), // 0: seldon.mlops.scheduler.PipelineSnapshot - (*ExperimentSnapshot)(nil), // 1: seldon.mlops.scheduler.ExperimentSnapshot - (*PipelineWithState)(nil), // 2: seldon.mlops.scheduler.PipelineWithState - (*Experiment)(nil), // 3: seldon.mlops.scheduler.Experiment -} -var file_mlops_scheduler_storage_proto_depIdxs = []int32{ - 2, // 0: seldon.mlops.scheduler.PipelineSnapshot.versions:type_name -> seldon.mlops.scheduler.PipelineWithState - 3, // 1: seldon.mlops.scheduler.ExperimentSnapshot.experiment:type_name -> seldon.mlops.scheduler.Experiment - 2, // [2:2] is the sub-list for method output_type - 2, // [2:2] is the sub-list for method input_type - 2, // [2:2] is the sub-list for extension type_name - 2, // [2:2] is the sub-list for extension extendee - 0, // [0:2] is the sub-list for field type_name -} - -func init() { file_mlops_scheduler_storage_proto_init() } -func file_mlops_scheduler_storage_proto_init() { - if File_mlops_scheduler_storage_proto != nil { - return - } - file_mlops_scheduler_scheduler_proto_init() - if !protoimpl.UnsafeEnabled { - file_mlops_scheduler_storage_proto_msgTypes[0].Exporter = func(v any, i int) any { - switch v := v.(*PipelineSnapshot); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_mlops_scheduler_storage_proto_msgTypes[1].Exporter = func(v any, i int) any { - switch v := v.(*ExperimentSnapshot); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - } - type x struct{} - out := protoimpl.TypeBuilder{ - File: protoimpl.DescBuilder{ - GoPackagePath: reflect.TypeOf(x{}).PkgPath(), - RawDescriptor: file_mlops_scheduler_storage_proto_rawDesc, - NumEnums: 0, - NumMessages: 2, - NumExtensions: 0, - NumServices: 0, - }, - GoTypes: file_mlops_scheduler_storage_proto_goTypes, - DependencyIndexes: file_mlops_scheduler_storage_proto_depIdxs, - MessageInfos: file_mlops_scheduler_storage_proto_msgTypes, - }.Build() - File_mlops_scheduler_storage_proto = out.File - file_mlops_scheduler_storage_proto_rawDesc = nil - file_mlops_scheduler_storage_proto_goTypes = nil - file_mlops_scheduler_storage_proto_depIdxs = nil -} diff --git a/apis/mlops/scheduler/db/db.proto b/apis/mlops/scheduler/db/db.proto index f647128ff4..fbfc776bd6 100644 --- a/apis/mlops/scheduler/db/db.proto +++ b/apis/mlops/scheduler/db/db.proto @@ -7,6 +7,29 @@ option go_package = "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/ import "google/protobuf/timestamp.proto"; import "mlops/scheduler/scheduler.proto"; +// +// PIPELINES AND EXPERIMENTS +// + +message PipelineSnapshot { + string name = 1; + uint32 lastVersion = 2; + repeated PipelineWithState versions = 3; + bool deleted = 4; +} + +message ExperimentSnapshot { + Experiment experiment = 1; + // to mark the experiment as deleted, this is currently required as we persist all + // experiments in the local scheduler state (badgerdb) so that events can be replayed + // on restart, which would guard against lost events in communication. + bool deleted = 2; +} + +// +// MODELS AND SERVERS +// + // ModelState represents the state of a model enum ModelState { MODEL_STATE_UNKNOWN = 0; @@ -58,18 +81,12 @@ message ModelStatus { google.protobuf.Timestamp timestamp = 8; } -// ReplicaStatusEntry for map representation -message ReplicaStatusEntry { - int32 replica_index = 1; - ReplicaStatus status = 2; -} - // ModelVersion represents a version of a model message ModelVersion { seldon.mlops.scheduler.Model model_defn = 1; uint32 version = 2; string server = 3; - map replicas = 4; + map replicas = 4; ModelStatus state = 5; } @@ -97,12 +114,6 @@ message Server { seldon.mlops.scheduler.KubernetesMeta kubernetes_meta = 7; } -// ModelLoadStatus represents a model's load status (for maps that use ModelVersionID as key) -message ModelLoadStatus { - ModelVersionID model_version_id = 1; - bool loaded = 2; -} - // ServerReplica represents a single replica of a server message ServerReplica { string inference_svc = 1; @@ -113,9 +124,8 @@ message ServerReplica { repeated string capabilities = 6; uint64 memory = 7; uint64 available_memory = 8; - // Using repeated messages since proto doesn't support complex map keys - repeated ModelLoadStatus loaded_models = 9; - repeated ModelLoadStatus loading_models = 10; + repeated ModelVersionID loaded_models = 9; + repeated ModelVersionID loading_models = 10; uint32 over_commit_percentage = 11; uint64 reserved_memory = 12; map unique_loaded_models = 13; diff --git a/apis/mlops/scheduler/storage.proto b/apis/mlops/scheduler/storage.proto deleted file mode 100644 index 69784b3c1c..0000000000 --- a/apis/mlops/scheduler/storage.proto +++ /dev/null @@ -1,23 +0,0 @@ -syntax = "proto3"; - -package seldon.mlops.scheduler; - -option go_package = "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler"; - -import "mlops/scheduler/scheduler.proto"; - -message PipelineSnapshot { - string name = 1; - uint32 lastVersion = 2; - repeated PipelineWithState versions = 3; - bool deleted = 4; -} - -message ExperimentSnapshot { - Experiment experiment = 1; - // to mark the experiment as deleted, this is currently required as we persist all - // experiments in the local scheduler state (badgerdb) so that events can be replayed - // on restart, which would guard against lost events in communication. - bool deleted = 2; -} - From 2ab0cd0a6bd8c5b95a0bdfb3069cc9b5fbd23a8f Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 29 Dec 2025 22:37:46 +0000 Subject: [PATCH 05/50] abstracted storage, now uses proto to store model/servers --- scheduler/cmd/scheduler/main.go | 3 +- scheduler/pkg/agent/server.go | 5 +- scheduler/pkg/store/experiment/db.go | 7 +- scheduler/pkg/store/experiment/state.go | 6 +- scheduler/pkg/store/experiment/store.go | 5 +- scheduler/pkg/store/experiment/utils.go | 7 +- scheduler/pkg/store/in-memory/store.go | 2 + scheduler/pkg/store/memory.go | 283 +++++++++++++----------- scheduler/pkg/store/memory_status.go | 74 ++++--- scheduler/pkg/store/mesh.go | 52 ++--- scheduler/pkg/store/pipeline/status.go | 5 +- scheduler/pkg/store/pipeline/store.go | 5 +- scheduler/pkg/store/pipeline/utils.go | 7 +- scheduler/pkg/store/store.go | 2 +- 14 files changed, 248 insertions(+), 215 deletions(-) diff --git a/scheduler/cmd/scheduler/main.go b/scheduler/cmd/scheduler/main.go index b30cdc989c..fd28d90904 100644 --- a/scheduler/cmd/scheduler/main.go +++ b/scheduler/cmd/scheduler/main.go @@ -23,6 +23,7 @@ import ( "syscall" "time" + in_memory "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/in-memory" log "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/backoff" @@ -298,7 +299,7 @@ func main() { } // Create stores - ss := store.NewMemoryStore(logger, store.NewLocalSchedulerStore(), eventHub) + ss := store.NewMemoryStore(logger, in_memory.NewStorage(), eventHub) ps := pipeline.NewPipelineStore(logger, eventHub, ss) es := experiment.NewExperimentServer(logger, eventHub, ss, ps) cleaner := cleaner.NewVersionCleaner(ss, logger) diff --git a/scheduler/pkg/agent/server.go b/scheduler/pkg/agent/server.go index 35c8233fa3..31338d6b2d 100644 --- a/scheduler/pkg/agent/server.go +++ b/scheduler/pkg/agent/server.go @@ -18,6 +18,7 @@ import ( "sync" "time" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "google.golang.org/grpc" @@ -267,12 +268,12 @@ func (s *Server) Sync(modelName string) { return } - latestModel := model.GetLatest() + latestModel := model.Latest() // we signal a model when other replica is available in case we have servers draining // TODO: extract as helper func if latestModel != nil { - available := latestModel.GetReplicaForState(store.Available) + available := latestModel.GetReplicaForState(db.ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE) if len(available) > 0 { s.waiter.signalModel(modelName) } diff --git a/scheduler/pkg/store/experiment/db.go b/scheduler/pkg/store/experiment/db.go index 391a6859f6..89ba4751b6 100644 --- a/scheduler/pkg/store/experiment/db.go +++ b/scheduler/pkg/store/experiment/db.go @@ -13,11 +13,10 @@ import ( "time" "github.com/dgraph-io/badger/v3" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" "google.golang.org/protobuf/proto" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/utils" ) @@ -115,7 +114,7 @@ func (edb *ExperimentDBManager) restore( continue } err := item.Value(func(v []byte) error { - snapshot := scheduler.ExperimentSnapshot{} + snapshot := db.ExperimentSnapshot{} err := proto.Unmarshal(v, &snapshot) if err != nil { return err @@ -152,7 +151,7 @@ func (edb *ExperimentDBManager) get(name string) (*Experiment, error) { return err } return item.Value(func(v []byte) error { - snapshot := scheduler.ExperimentSnapshot{} + snapshot := db.ExperimentSnapshot{} err = proto.Unmarshal(v, &snapshot) if err != nil { return err diff --git a/scheduler/pkg/store/experiment/state.go b/scheduler/pkg/store/experiment/state.go index aacfae1526..d3c493203b 100644 --- a/scheduler/pkg/store/experiment/state.go +++ b/scheduler/pkg/store/experiment/state.go @@ -10,7 +10,7 @@ the Change License after the Change Date as each is defined in accordance with t package experiment import ( - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" pipeline2 "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/pipeline" ) @@ -161,7 +161,7 @@ func (es *ExperimentStore) setCandidateAndMirrorReadiness(experiment *Experiment if err != nil { logger.WithError(err).Infof("Failed to get model %s for candidate check for experiment %s", candidate.Name, experiment.Name) } else { - if model.GetLatest() != nil && model.GetLatest().ModelState().State == store.ModelAvailable { + if model.Latest() != nil && model.Latest().State.State == db.ModelState_MODEL_STATE_AVAILABLE { candidate.Ready = true } else { candidate.Ready = false @@ -173,7 +173,7 @@ func (es *ExperimentStore) setCandidateAndMirrorReadiness(experiment *Experiment if err != nil { logger.WithError(err).Warnf("Failed to get model %s for mirror check for experiment %s", experiment.Mirror.Name, experiment.Name) } else { - if model.GetLatest() != nil && model.GetLatest().ModelState().State == store.ModelAvailable { + if model.Latest() != nil && model.Latest().State.State == db.ModelState_MODEL_STATE_AVAILABLE { experiment.Mirror.Ready = true } else { experiment.Mirror.Ready = false diff --git a/scheduler/pkg/store/experiment/store.go b/scheduler/pkg/store/experiment/store.go index 50324cca10..df997afc25 100644 --- a/scheduler/pkg/store/experiment/store.go +++ b/scheduler/pkg/store/experiment/store.go @@ -17,6 +17,7 @@ import ( "time" "github.com/mitchellh/copystructure" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" @@ -194,7 +195,7 @@ func (es *ExperimentStore) handleModelEvents(event coordinator.ModelEventMsg) { if err != nil { logger.WithError(err).Warnf("Failed to get model %s for candidate check for experiment %s", event.ModelName, experiment.Name) } else { - if model.GetLatest() != nil && model.GetLatest().ModelState().State == store.ModelAvailable { + if model.Latest() != nil && model.Latest().State.State == db.ModelState_MODEL_STATE_AVAILABLE { candidate.Ready = true } else { candidate.Ready = false @@ -208,7 +209,7 @@ func (es *ExperimentStore) handleModelEvents(event coordinator.ModelEventMsg) { if err != nil { logger.WithError(err).Warnf("Failed to get model %s for mirror check for experiment %s", event.ModelName, experiment.Name) } else { - if model.GetLatest() != nil && model.GetLatest().ModelState().State == store.ModelAvailable { + if model.Latest() != nil && model.Latest().State.State == db.ModelState_MODEL_STATE_AVAILABLE { experiment.Mirror.Ready = true } else { experiment.Mirror.Ready = false diff --git a/scheduler/pkg/store/experiment/utils.go b/scheduler/pkg/store/experiment/utils.go index 6b72e7e22a..12bf4b8f74 100644 --- a/scheduler/pkg/store/experiment/utils.go +++ b/scheduler/pkg/store/experiment/utils.go @@ -11,6 +11,7 @@ package experiment import ( "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" ) func CreateExperimentFromRequest(request *scheduler.Experiment) *Experiment { @@ -61,13 +62,13 @@ func CreateExperimentFromRequest(request *scheduler.Experiment) *Experiment { } } -func CreateExperimentFromSnapshot(request *scheduler.ExperimentSnapshot) *Experiment { +func CreateExperimentFromSnapshot(request *db.ExperimentSnapshot) *Experiment { experiment := CreateExperimentFromRequest(request.Experiment) experiment.Deleted = request.Deleted return experiment } -func CreateExperimentSnapshotProto(experiment *Experiment) *scheduler.ExperimentSnapshot { +func CreateExperimentSnapshotProto(experiment *Experiment) *db.ExperimentSnapshot { var candidates []*scheduler.ExperimentCandidate for _, candidate := range experiment.Candidates { candidates = append(candidates, &scheduler.ExperimentCandidate{ @@ -102,7 +103,7 @@ func CreateExperimentSnapshotProto(experiment *Experiment) *scheduler.Experiment case ModelResourceType: resourceType = scheduler.ResourceType_MODEL } - return &scheduler.ExperimentSnapshot{ + return &db.ExperimentSnapshot{ Experiment: &scheduler.Experiment{ Name: experiment.Name, Default: experiment.Default, diff --git a/scheduler/pkg/store/in-memory/store.go b/scheduler/pkg/store/in-memory/store.go index 0e11dd5589..8d9bd0c3fa 100644 --- a/scheduler/pkg/store/in-memory/store.go +++ b/scheduler/pkg/store/in-memory/store.go @@ -15,6 +15,8 @@ type Storage struct { models map[string]*db.Model } +var _ store.Storage = &Storage{} + func NewStorage() *Storage { return &Storage{ servers: make(map[string]*db.Server), diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index d15c9d6640..e91e9453eb 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -69,29 +69,30 @@ func (m *MemoryStore) GetModels() ([]*db.Model, error) { return m.store.ListModels() } -func (m *MemoryStore) addModelVersionIfNotExists(req *agent.ModelVersion) (*Model, *ModelVersion, error) { +func (m *MemoryStore) addModelVersionIfNotExists(req *agent.ModelVersion) (*db.Model, *db.ModelVersion, error) { modelName := req.GetModel().GetMeta().GetName() model, err := m.store.GetModel(modelName) if err != nil { if !errors.Is(err, ErrNotFound) { return nil, nil, err } - // TODO pointer save issue - model = &Model{} + model = &db.Model{Name: modelName} if err := m.store.AddModel(model); err != nil { return nil, nil, err } } if existingModelVersion := model.GetVersion(req.GetVersion()); existingModelVersion == nil { modelVersion := NewDefaultModelVersion(req.GetModel(), req.GetVersion()) - model.versions = append(model.versions, modelVersion) - sort.SliceStable(model.versions, func(i, j int) bool { // resort model versions based on version number - return model.versions[i].GetVersion() < model.versions[j].GetVersion() + model.Versions = append(model.Versions, modelVersion) + sort.SliceStable(model.Versions, func(i, j int) bool { // resort model versions based on version number + return model.Versions[i].GetVersion() < model.Versions[j].GetVersion() }) return model, modelVersion, nil } else { return model, existingModelVersion, nil } + + // TODO pointer save issue } func (m *MemoryStore) addNextModelVersion(model *db.Model, pbmodel *pb.Model) { @@ -189,18 +190,6 @@ func (m *MemoryStore) UpdateModel(req *pb.LoadModelRequest) error { return nil } -func (m *MemoryStore) getModelImpl(key string) *ModelSnapshot { - model, err := m.store.GetModel(key) - if err == nil { - return m.deepCopy(model, key) - } - - return &ModelSnapshot{ - Name: key, - Versions: nil, - } -} - func (m *MemoryStore) deepCopy(model *Model, key string) *ModelSnapshot { snapshot := &ModelSnapshot{ Name: key, @@ -233,7 +222,11 @@ func (m *MemoryStore) UnlockModel(modelId string) { func (m *MemoryStore) GetModel(key string) (*db.Model, error) { m.mu.RLock() defer m.mu.RUnlock() - return m.getModelImpl(key), nil + model, err := m.store.GetModel(key) + if err != nil { + return nil, fmt.Errorf("failed to get model %s: %w", key, err) + } + return model, nil } func (m *MemoryStore) RemoveModel(req *pb.UnloadModelRequest) error { @@ -281,29 +274,28 @@ func (m *MemoryStore) GetServers(shallow bool, modelDetails bool) ([]*db.Server, return servers, nil } -func (m *MemoryStore) GetServer(serverKey string, shallow bool, modelDetails bool) (*db.Server, error) { +func (m *MemoryStore) GetServer(serverKey string, shallow bool, modelDetails bool) (*db.Server, *ServerStats, error) { m.mu.RLock() defer m.mu.RUnlock() server, err := m.store.GetServer(serverKey) if err != nil { if errors.Is(err, ErrNotFound) { - return nil, fmt.Errorf("server [%s] not found", serverKey) + return nil, nil, fmt.Errorf("server [%s] not found", serverKey) } - return nil, err + return nil, nil, err } - // TODO: refactor cleanly - snapshot := server.CreateSnapshot(shallow, modelDetails) if modelDetails { // this is a hint to the caller that the server is in a state where it can be scaled down - snapshot.Stats, err = m.getServerStats(serverKey) + stats, err := m.getServerStats(serverKey) if err != nil { - return nil, err + return nil, nil, fmt.Errorf("failed to get stats: %w", err) } + return server, stats, nil } - return snapshot, nil + return server, nil, nil } func (m *MemoryStore) getServerStats(serverKey string) (*ServerStats, error) { @@ -327,7 +319,7 @@ func (m *MemoryStore) getModelServer( modelKey string, version uint32, serverKey string, -) (*Model, *ModelVersion, *Server, error) { +) (*db.Model, *db.ModelVersion, *db.Server, error) { // Validate model, err := m.store.GetModel(modelKey) if err != nil { @@ -369,7 +361,7 @@ func (m *MemoryStore) updateLoadedModelsImpl( modelKey string, version uint32, serverKey string, - replicas []*ServerReplica, + replicas []*db.ServerReplica, ) (*coordinator.ModelEventMsg, error) { logger := m.logger.WithField("func", "updateLoadedModelsImpl") @@ -378,15 +370,18 @@ func (m *MemoryStore) updateLoadedModelsImpl( model, err := m.store.GetModel(modelKey) if err != nil { if errors.Is(err, ErrNotFound) { - return nil, fmt.Errorf("Model [%s] not found", modelKey) + return nil, fmt.Errorf("model [%s] not found", modelKey) } return nil, fmt.Errorf("failed to find model %s", modelKey) } modelVersion := model.Latest() + if modelVersion == nil { + return nil, fmt.Errorf("latest version not found for model %s", modelKey) + } if version != modelVersion.GetVersion() { return nil, fmt.Errorf( - "Model version mismatch for %s got %d but latest version is now %d", + "model version mismatch for %s got %d but latest version is now %d", modelKey, version, modelVersion.GetVersion(), ) } @@ -394,7 +389,7 @@ func (m *MemoryStore) updateLoadedModelsImpl( if serverKey == "" { // nothing to do for a model that doesn't have a server, proceed with sending an event for downstream return &coordinator.ModelEventMsg{ - ModelName: modelVersion.GetMeta().GetName(), + ModelName: modelVersion.ModelDefn.Meta.Name, ModelVersion: modelVersion.GetVersion(), }, nil } @@ -404,56 +399,56 @@ func (m *MemoryStore) updateLoadedModelsImpl( return nil, fmt.Errorf("failed to find server %s", serverKey) } - assignedReplicaIds := make(map[int]struct{}) + assignedReplicaIds := make(map[int32]struct{}) for _, replica := range replicas { - if _, ok := server.replicas[replica.replicaIdx]; !ok { + if _, ok := server.Replicas[replica.ReplicaIdx]; !ok { return nil, fmt.Errorf( "failed to reserve replica %d as it does not exist on server %s", - replica.replicaIdx, serverKey, + replica.ReplicaIdx, serverKey, ) } - assignedReplicaIds[replica.replicaIdx] = struct{}{} + assignedReplicaIds[replica.ReplicaIdx] = struct{}{} } - for modelVersion.HasServer() && modelVersion.Server() != serverKey { - logger.Debugf("Adding new version as server changed to %s from %s", modelVersion.Server(), serverKey) - m.addNextModelVersion(model, model.Latest().modelDefn) + for modelVersion.HasServer() && modelVersion.Server != serverKey { + logger.Debugf("Adding new version as server changed to %s from %s", modelVersion.Server, serverKey) + m.addNextModelVersion(model, model.Latest().ModelDefn) modelVersion = model.Latest() } // reserve memory for existing replicas that are not already loading or loaded replicaStateUpdated := false for replicaIdx := range assignedReplicaIds { - if existingState, ok := modelVersion.replicas[replicaIdx]; !ok { + if existingState, ok := modelVersion.Replicas[replicaIdx]; !ok { logger.Debugf( "Model %s version %d state %s on server %s replica %d does not exist yet and should be loaded", - modelKey, modelVersion.version, existingState.State.String(), serverKey, replicaIdx, + modelKey, modelVersion.Version, existingState.State.String(), serverKey, replicaIdx, ) - modelVersion.SetReplicaState(replicaIdx, LoadRequested, "") - m.updateReservedMemory(LoadRequested, serverKey, replicaIdx, modelVersion.GetRequiredMemory()) + modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED, "") + m.updateReservedMemory(db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, serverKey, int(replicaIdx), modelVersion.GetRequiredMemory()) replicaStateUpdated = true } else { logger.Debugf( "Checking if model %s version %d state %s on server %s replica %d should be loaded", - modelKey, modelVersion.version, existingState.State.String(), serverKey, replicaIdx, + modelKey, modelVersion.Version, existingState.State.String(), serverKey, replicaIdx, ) if !existingState.State.AlreadyLoadingOrLoaded() { - modelVersion.SetReplicaState(replicaIdx, LoadRequested, "") - m.updateReservedMemory(LoadRequested, serverKey, replicaIdx, modelVersion.GetRequiredMemory()) + modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, "") + m.updateReservedMemory(db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, serverKey, int(replicaIdx), modelVersion.GetRequiredMemory()) replicaStateUpdated = true } } } // Unload any existing model replicas assignments that are no longer part of the replica set - for replicaIdx, existingState := range modelVersion.ReplicaState() { + for replicaIdx, existingState := range modelVersion.Replicas { if _, ok := assignedReplicaIds[replicaIdx]; !ok { logger.Debugf( "Checking if replicaidx %d with state %s should be unloaded", replicaIdx, existingState.State.String(), ) - if !existingState.State.UnloadingOrUnloaded() && existingState.State != Draining { - modelVersion.SetReplicaState(replicaIdx, UnloadEnvoyRequested, "") + if !existingState.State.UnloadingOrUnloaded() && existingState.State != db.ModelReplicaState_MODEL_REPLICA_STATE_DRAINING { + modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED, "") replicaStateUpdated = true } } @@ -469,21 +464,22 @@ func (m *MemoryStore) updateLoadedModelsImpl( // in modelVersion.state.AvailableReplicas (we call updateModelStatus later) // TODO: the conditions here keep growing, refactor or consider a simpler check. - if replicaStateUpdated || modelVersion.state.State == ScheduleFailed || model.IsDeleted() || modelVersion.state.State == ModelProgressing || - (modelVersion.state.State == ModelAvailable && len(modelVersion.GetAssignment()) < modelVersion.DesiredReplicas()) { + if replicaStateUpdated || modelVersion.State.State == db.ModelState_MODEL_STATE_SCHEDULE_FAILED || model.Deleted || + modelVersion.State.State == db.ModelState_MODEL_STATE_PROGRESSING || + (modelVersion.State.State == db.ModelState_MODEL_STATE_AVAILABLE && len(modelVersion.GetAssignment()) < modelVersion.DesiredReplicas()) { logger.Debugf("Updating model status for model %s server %s", modelKey, serverKey) - modelVersion.SetServer(serverKey) - m.updateModelStatus(true, model.IsDeleted(), modelVersion, model.GetLastAvailableModelVersion()) + modelVersion.Server = serverKey + m.updateModelStatus(true, model.Deleted, modelVersion, model.GetLastAvailableModelVersion()) return &coordinator.ModelEventMsg{ - ModelName: modelVersion.GetMeta().GetName(), + ModelName: modelVersion.ModelDefn.Meta.Name, ModelVersion: modelVersion.GetVersion(), }, nil - } else { - logger.Debugf("Model status update not required for model %s server %s as no replicas were updated", modelKey, serverKey) - return nil, nil } + + logger.Debugf("Model status update not required for model %s server %s as no replicas were updated", modelKey, serverKey) + return nil, nil } func (m *MemoryStore) UnloadVersionModels(modelKey string, version uint32) (bool, error) { @@ -517,33 +513,34 @@ func (m *MemoryStore) unloadVersionModelsImpl(modelKey string, version uint32) ( } updated := false - for replicaIdx, existingState := range modelVersion.ReplicaState() { + for replicaIdx, existingState := range modelVersion.Replicas { if !existingState.State.UnloadingOrUnloaded() { logger.Debugf( "Setting model %s version %d on server %s replica %d to UnloadRequested was %s", modelKey, - modelVersion.version, - modelVersion.Server(), + modelVersion.Version, + modelVersion.Server, replicaIdx, existingState.State.String(), ) - modelVersion.SetReplicaState(replicaIdx, UnloadRequested, "") + modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED, "") updated = true - } else { - logger.Debugf( - "model %s on server %s replica %d already unloaded", - modelKey, modelVersion.Server(), replicaIdx, - ) + continue } + logger.Debugf( + "model %s on server %s replica %d already unloaded", + modelKey, modelVersion.Server, replicaIdx, + ) } if updated { logger.Debugf("Calling update model status for model %s version %d", modelKey, version) - m.updateModelStatus(false, model.IsDeleted(), modelVersion, model.GetLastAvailableModelVersion()) + m.updateModelStatus(false, model.Deleted, modelVersion, model.GetLastAvailableModelVersion()) return &coordinator.ModelEventMsg{ - ModelName: modelVersion.GetMeta().GetName(), + ModelName: modelVersion.ModelDefn.Meta.Name, ModelVersion: modelVersion.GetVersion(), }, true, nil } + return nil, false, nil } @@ -583,8 +580,8 @@ func (m *MemoryStore) updateModelStateImpl( serverKey string, replicaIdx int, availableMemory *uint64, - expectedState ModelReplicaState, - desiredState ModelReplicaState, + expectedState db.ModelReplicaState, + desiredState db.ModelReplicaState, reason string, runtimeInfo *pb.ModelRuntimeInfo, ) (*coordinator.ModelEventMsg, *coordinator.ServerEventMsg, error) { @@ -604,7 +601,7 @@ func (m *MemoryStore) updateModelStateImpl( if existingState != expectedState { return nil, nil, fmt.Errorf( - "State mismatch for %s:%d expected state %s but was %s when trying to move to state %s", + "state mismatch for %s:%d expected state %s but was %s when trying to move to state %s", modelKey, version, expectedState.String(), existingState.String(), desiredState.String(), ) } @@ -623,19 +620,22 @@ func (m *MemoryStore) updateModelStateImpl( ) // Update models loaded onto replica for relevant state - if desiredState == Loaded || desiredState == Loading || desiredState == Unloaded || desiredState == LoadFailed { + if desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED || + desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING || + desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED || + desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED { // TODO pointer server, err := m.store.GetServer(serverKey) // TODO handle error if err == nil { - replica, ok := server.replicas[replicaIdx] + replica, ok := server.Replicas[int32(replicaIdx)] if ok { - if desiredState == Loaded || desiredState == Loading { + if desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED || desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING { logger.Infof( "Adding model %s(%d) to server %s replica %d list of loaded / loading models", modelKey, version, serverKey, replicaIdx, ) - replica.addModelVersion(modelKey, version, desiredState) // we need to distinguish between loaded and loading + replica.AddModelVersion(modelKey, version, desiredState) // we need to distinguish between loaded and loading } else { logger.Infof( "Removing model %s(%d) from server %s replica %d list of loaded / loading models", @@ -643,18 +643,18 @@ func (m *MemoryStore) updateModelStateImpl( ) // we could go from loaded -> unloaded or loading -> failed. in the case we have a failure then we just remove from loading deletedModelReplica = true - replica.deleteModelVersion(modelKey, version) + replica.DeleteModelVersion(modelKey, version) } } } } if availableMemory != nil { - server.replicas[replicaIdx].availableMemory = *availableMemory + server.Replicas[int32(replicaIdx)].AvailableMemory = *availableMemory } - m.updateModelStatus(isLatest, model.IsDeleted(), modelVersion, model.GetLastAvailableModelVersion()) + m.updateModelStatus(isLatest, model.Deleted, modelVersion, model.GetLastAvailableModelVersion()) modelEvt := &coordinator.ModelEventMsg{ - ModelName: modelVersion.GetMeta().GetName(), + ModelName: modelVersion.ModelDefn.Meta.Name, ModelVersion: modelVersion.GetVersion(), } if deletedModelReplica { @@ -675,18 +675,19 @@ func (m *MemoryStore) updateModelStateImpl( } func (m *MemoryStore) updateReservedMemory( - modelReplicaState ModelReplicaState, serverKey string, replicaIdx int, memBytes uint64, + modelReplicaState db.ModelReplicaState, serverKey string, replicaIdx int, memBytes uint64, ) { // update reserved memory that is being used for sorting replicas // do we need to lock replica update? server, err := m.store.GetServer(serverKey) // TODO handle error?? if err == nil { - replica, okReplica := server.replicas[replicaIdx] + replica, okReplica := server.Replicas[int32(replicaIdx)] if okReplica { - if modelReplicaState == LoadRequested { + if modelReplicaState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED { replica.UpdateReservedMemory(memBytes, true) - } else if modelReplicaState == LoadFailed || modelReplicaState == Loaded { + } else if modelReplicaState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED || + modelReplicaState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED { replica.UpdateReservedMemory(memBytes, false) } } @@ -729,7 +730,7 @@ func (m *MemoryStore) addServerReplicaImpl(request *agent.AgentSubscribeRequest) return nil, coordinator.ServerEventMsg{}, fmt.Errorf("failed to add server %s: %w", request.ServerName, err) } } - server.shared = request.Shared + server.Shared = request.Shared loadedModels := toSchedulerLoadedModels(request.LoadedModels) @@ -740,7 +741,7 @@ func (m *MemoryStore) addServerReplicaImpl(request *agent.AgentSubscribeRequest) request.ReplicaConfig, request.AvailableMemoryBytes, ) - server.replicas[int(request.ReplicaIdx)] = serverReplica + server.Replicas[int32(request.ReplicaIdx)] = serverReplica var evts []coordinator.ModelEventMsg for _, modelVersionReq := range request.LoadedModels { @@ -748,18 +749,18 @@ func (m *MemoryStore) addServerReplicaImpl(request *agent.AgentSubscribeRequest) if err != nil { return nil, coordinator.ServerEventMsg{}, err } - modelVersion.replicas[int(request.ReplicaIdx)] = ReplicaStatus{State: Loaded} - modelVersion.SetServer(request.ServerName) + modelVersion.Replicas[int32(request.ReplicaIdx)] = &db.ReplicaStatus{State: db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED} + modelVersion.Server = request.ServerName m.updateModelStatus(true, false, modelVersion, model.GetLastAvailableModelVersion()) evts = append(evts, coordinator.ModelEventMsg{ - ModelName: modelVersion.GetMeta().GetName(), + ModelName: modelVersion.ModelDefn.Meta.Name, ModelVersion: modelVersion.GetVersion(), }) } serverEvt := coordinator.ServerEventMsg{ ServerName: request.ServerName, - ServerIdx: uint32(request.ReplicaIdx), + ServerIdx: request.ReplicaIdx, UpdateContext: coordinator.SERVER_REPLICA_CONNECTED, } @@ -790,19 +791,19 @@ func (m *MemoryStore) removeServerReplicaImpl(serverName string, replicaIdx int) if err != nil { return nil, nil, fmt.Errorf("failed to find server %s: %w", serverName, err) } - serverReplica, ok := server.replicas[replicaIdx] + serverReplica, ok := server.Replicas[int32(replicaIdx)] if !ok { return nil, nil, fmt.Errorf("Failed to find replica %d for server %s", replicaIdx, serverName) } - delete(server.replicas, replicaIdx) + delete(server.Replicas, int32(replicaIdx)) // TODO we should not reschedule models on servers with dedicated models, e.g. non shareable servers - if len(server.replicas) == 0 { + if len(server.Replicas) == 0 { if err := m.store.DeleteServer(serverName); err != nil { return nil, nil, fmt.Errorf("failed to delete server %s: %w", serverName, err) } } - loadedModelsRemoved, loadedEvts := m.removeModelfromServerReplica(serverReplica.loadedModels, replicaIdx) - loadingModelsRemoved, loadingEtvs := m.removeModelfromServerReplica(serverReplica.loadingModels, replicaIdx) + loadedModelsRemoved, loadedEvts := m.removeModelfromServerReplica(serverReplica.LoadedModels, replicaIdx) + loadingModelsRemoved, loadingEtvs := m.removeModelfromServerReplica(serverReplica.LoadingModels, replicaIdx) modelsRemoved := append(loadedModelsRemoved, loadingModelsRemoved...) evts := append(loadedEvts, loadingEtvs...) @@ -810,16 +811,16 @@ func (m *MemoryStore) removeServerReplicaImpl(serverName string, replicaIdx int) return modelsRemoved, evts, nil } -func (m *MemoryStore) removeModelfromServerReplica(lModels map[ModelVersionID]bool, replicaIdx int) ([]string, []coordinator.ModelEventMsg) { +func (m *MemoryStore) removeModelfromServerReplica(models []*db.ModelVersionID, replicaIdx int) ([]string, []coordinator.ModelEventMsg) { logger := m.logger.WithField("func", "RemoveServerReplica") var modelNames []string var evts []coordinator.ModelEventMsg // Find models to reschedule due to this server replica being removed - for modelVersionID := range lModels { + for _, v := range models { // TODO pointer issue - model, err := m.store.GetModel(modelVersionID.Name) + model, err := m.store.GetModel(v.Name) if err == nil { - modelVersion := model.GetVersion(modelVersionID.Version) + modelVersion := model.GetVersion(v.Version) if modelVersion != nil { modelVersion.DeleteReplica(replicaIdx) if model.Deleted || model.Latest().GetVersion() != modelVersion.GetVersion() { @@ -827,27 +828,28 @@ func (m *MemoryStore) removeModelfromServerReplica(lModels map[ModelVersionID]bo // is still in the process of being loaded. In this case we should not reschedule the model. logger.Debugf( "Model %s is being deleted and server replica %d is disconnected, skipping", - modelVersionID.Name, replicaIdx, + v.Name, replicaIdx, ) - modelVersion.SetReplicaState(replicaIdx, Unloaded, "model is removed when server replica was removed") - m.LockModel(modelVersionID.Name) + modelVersion.SetReplicaState(replicaIdx, db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED, + "model is removed when server replica was removed") + m.LockModel(v.Name) m.updateModelStatus( model.Latest().GetVersion() == modelVersion.GetVersion(), - model.IsDeleted(), modelVersion, model.GetLastAvailableModelVersion()) - m.UnlockModel(modelVersionID.Name) + model.Deleted, modelVersion, model.GetLastAvailableModelVersion()) + m.UnlockModel(v.Name) // send an event to progress the deletion evts = append( evts, coordinator.ModelEventMsg{ - ModelName: modelVersion.GetMeta().GetName(), + ModelName: modelVersion.ModelDefn.Meta.Name, ModelVersion: modelVersion.GetVersion(), }, ) } else { - modelNames = append(modelNames, modelVersionID.Name) + modelNames = append(modelNames, v.Name) } } else { - logger.Warnf("Can't find model version %s", modelVersionID.String()) + logger.Warnf("Can't find model version %s", v.String()) } } } @@ -888,22 +890,22 @@ func (m *MemoryStore) drainServerReplicaImpl(serverName string, replicaIdx int) return append(loadedModels, loadingModels...), nil } -func (m *MemoryStore) findModelsToReSchedule(models map[ModelVersionID]bool, replicaIdx int) []string { +func (m *MemoryStore) findModelsToReSchedule(models []*db.ModelVersionID, replicaIdx int) []string { logger := m.logger.WithField("func", "DrainServerReplica") modelsReSchedule := make([]string, 0) - for modelVersionID := range models { + for _, v := range models { // TODO pointer issue // TODO handle error? - model, err := m.store.GetModel(modelVersionID.Name) + model, err := m.store.GetModel(v.Name) if err == nil { - modelVersion := model.GetVersion(modelVersionID.Version) + modelVersion := model.GetVersion(v.Version) if modelVersion != nil { - modelVersion.SetReplicaState(replicaIdx, Draining, "trigger to drain") - modelsReSchedule = append(modelsReSchedule, modelVersionID.Name) + modelVersion.SetReplicaState(replicaIdx, db.ModelReplicaState_MODEL_REPLICA_STATE_DRAINING, "trigger to drain") + modelsReSchedule = append(modelsReSchedule, model.Name) continue } - logger.Warnf("Can't find model version %s", modelVersionID.String()) + logger.Warnf("Can't find model version %s", model.String()) } } @@ -920,17 +922,28 @@ func (m *MemoryStore) ServerNotify(request *pb.ServerNotify) error { server, err := m.store.GetServer(request.Name) if err != nil { if !errors.Is(err, ErrNotFound) { - return err + return fmt.Errorf("failed to find server %s: %w", request.Name, err) } server = NewServer(request.Name, request.Shared) + server.ExpectedReplicas = int32(request.ExpectedReplicas) + server.MinReplicas = int32(request.MinReplicas) + server.MaxReplicas = int32(request.MaxReplicas) + server.KubernetesMeta = request.KubernetesMeta if err := m.store.AddServer(server); err != nil { return err } + return nil } - server.SetExpectedReplicas(int(request.ExpectedReplicas)) - server.SetMinReplicas(int(request.MinReplicas)) - server.SetMaxReplicas(int(request.MaxReplicas)) - server.SetKubernetesMeta(request.KubernetesMeta) + + server.ExpectedReplicas = int32(request.ExpectedReplicas) + server.MinReplicas = int32(request.MinReplicas) + server.MaxReplicas = int32(request.MaxReplicas) + server.KubernetesMeta = request.KubernetesMeta + + if err := m.store.UpdateServer(server); err != nil { + return fmt.Errorf("failed to update server %s: %w", request.Name, err) + } + return nil } @@ -960,21 +973,20 @@ func (m *MemoryStore) maxNumModelReplicasForServer(serverName string) (uint32, e maxNumModels := uint32(0) for _, model := range models { latest := model.Latest() - if latest != nil && latest.Server() == serverName { + if latest != nil && latest.Server == serverName { maxNumModels = max(maxNumModels, uint32(latest.DesiredReplicas())) } } return maxNumModels, nil } -func toSchedulerLoadedModels(agentLoadedModels []*agent.ModelVersion) map[ModelVersionID]bool { - loadedModels := make(map[ModelVersionID]bool) +func toSchedulerLoadedModels(agentLoadedModels []*agent.ModelVersion) []*db.ModelVersionID { + loadedModels := make([]*db.ModelVersionID, 0) for _, modelVersionReq := range agentLoadedModels { - key := ModelVersionID{ + loadedModels = append(loadedModels, &db.ModelVersionID{ Name: modelVersionReq.GetModel().GetMeta().GetName(), Version: modelVersionReq.GetVersion(), - } - loadedModels[key] = true + }) } return loadedModels } @@ -997,7 +1009,7 @@ func (m *MemoryStore) SetModelGwModelState(name string, versionNumber uint32, st return nil } -func (m *MemoryStore) setModelGwModelStateImpl(name string, versionNumber uint32, status ModelState, reason, source string) ([]*coordinator.ModelEventMsg, error) { +func (m *MemoryStore) setModelGwModelStateImpl(name string, versionNumber uint32, status db.ModelState, reason, source string) ([]*coordinator.ModelEventMsg, error) { var evts []*coordinator.ModelEventMsg m.mu.Lock() @@ -1013,11 +1025,11 @@ func (m *MemoryStore) setModelGwModelStateImpl(name string, versionNumber uint32 return nil, fmt.Errorf("version not found for model %s, version %d", name, versionNumber) } - if modelVersion.state.ModelGwState != status || modelVersion.state.ModelGwReason != reason { - modelVersion.state.ModelGwState = status - modelVersion.state.ModelGwReason = reason + if modelVersion.State.ModelGwState != status || modelVersion.State.ModelGwReason != reason { + modelVersion.State.ModelGwState = status + modelVersion.State.ModelGwReason = reason evt := &coordinator.ModelEventMsg{ - ModelName: modelVersion.GetMeta().GetName(), + ModelName: modelVersion.ModelDefn.Meta.Name, ModelVersion: modelVersion.GetVersion(), Source: source, } @@ -1033,9 +1045,9 @@ func (m *MemoryStore) EmitEvents() error { } for _, server := range servers { - for id := range server.replicas { + for id := range server.Replicas { m.eventHub.PublishServerEvent(serverUpdateEventSource, coordinator.ServerEventMsg{ - ServerName: server.name, + ServerName: server.Name, ServerIdx: uint32(id), Source: serverUpdateEventSource, UpdateContext: coordinator.SERVER_REPLICA_CONNECTED, // TODO can we be confident of that? @@ -1049,10 +1061,15 @@ func (m *MemoryStore) EmitEvents() error { } for _, model := range models { + latest := model.GetLastAvailableModelVersion() + if latest == nil { + continue + } + m.eventHub.PublishModelEvent(modelUpdateEventSource, coordinator.ModelEventMsg{ - ModelName: model.Name(), + ModelName: model.Name, Source: modelUpdateEventSource, - ModelVersion: model.GetLastAvailableModelVersion().version, + ModelVersion: latest.Version, }) } diff --git a/scheduler/pkg/store/memory_status.go b/scheduler/pkg/store/memory_status.go index 989caef9d6..a996bfe7c0 100644 --- a/scheduler/pkg/store/memory_status.go +++ b/scheduler/pkg/store/memory_status.go @@ -37,76 +37,84 @@ type modelVersionStateStatistics struct { lastFailedReason string } -func calcModelVersionStatistics(modelVersion *ModelVersion, deleted bool) *modelVersionStateStatistics { +func calcModelVersionStatistics(modelVersion *db.ModelVersion, deleted bool) *modelVersionStateStatistics { s := modelVersionStateStatistics{} - for _, replicaState := range modelVersion.ReplicaState() { + + for _, replicaState := range modelVersion.Replicas { switch replicaState.State { - case Available: + case db.ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE: s.replicasAvailable++ - case LoadRequested, Loading, Loaded: // unavailable but OK + case db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, + db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING, db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED: // unavailable but OK s.replicasLoading++ - case LoadFailed, LoadedUnavailable: // unavailable but not OK + case db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED, db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE: // unavailable but not OK s.replicasLoadFailed++ - if !deleted && replicaState.Timestamp.After(s.lastFailedStateTime) { - s.lastFailedStateTime = replicaState.Timestamp + if !deleted && replicaState.Timestamp.AsTime().After(s.lastFailedStateTime) { + s.lastFailedStateTime = replicaState.Timestamp.AsTime() s.lastFailedReason = replicaState.Reason } - case UnloadEnvoyRequested, UnloadRequested, Unloading: + case db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED, + db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED, db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADING: s.replicasUnloading++ - case Unloaded: + case db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED: s.replicasUnloaded++ - case UnloadFailed: + case db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_FAILED: s.replicasUnloadFailed++ - if deleted && replicaState.Timestamp.After(s.lastFailedStateTime) { - s.lastFailedStateTime = replicaState.Timestamp + if deleted && replicaState.Timestamp.AsTime().After(s.lastFailedStateTime) { + s.lastFailedStateTime = replicaState.Timestamp.AsTime() s.lastFailedReason = replicaState.Reason } - case Draining: + case db.ModelReplicaState_MODEL_REPLICA_STATE_DRAINING: s.replicasDraining++ } - if replicaState.Timestamp.After(s.latestTime) { - s.latestTime = replicaState.Timestamp + if replicaState.Timestamp.AsTime().After(s.latestTime) { + s.latestTime = replicaState.Timestamp.AsTime() } } return &s } -func updateModelState(isLatest bool, modelVersion *ModelVersion, prevModelVersion *ModelVersion, stats *modelVersionStateStatistics, deleted bool) { - var modelState ModelState +func updateModelState(isLatest bool, modelVersion *db.ModelVersion, prevModelVersion *db.ModelVersion, stats *modelVersionStateStatistics, deleted bool) { + var modelState db.ModelState var modelReason string + modelTimestamp := stats.latestTime if deleted || !isLatest { if stats.replicasUnloadFailed > 0 { - modelState = ModelTerminateFailed + modelState = db.ModelState_MODEL_STATE_TERMINATED modelReason = stats.lastFailedReason modelTimestamp = stats.lastFailedStateTime } else if stats.replicasUnloading > 0 || stats.replicasAvailable > 0 || stats.replicasLoading > 0 { - modelState = ModelTerminating + modelState = db.ModelState_MODEL_STATE_TERMINATING } else { - modelState = ModelTerminated + modelState = db.ModelState_MODEL_STATE_TERMINATED } } else { if stats.replicasLoadFailed > 0 { - modelState = ModelFailed + modelState = db.ModelState_MODEL_STATE_FAILED modelReason = stats.lastFailedReason modelTimestamp = stats.lastFailedStateTime - } else if modelVersion.GetDeploymentSpec() != nil && stats.replicasAvailable == 0 && modelVersion.GetDeploymentSpec().Replicas == 0 && modelVersion.GetDeploymentSpec().MinReplicas == 0 { - modelState = ModelScaledDown - } else if (modelVersion.GetDeploymentSpec() != nil && stats.replicasAvailable == modelVersion.GetDeploymentSpec().Replicas) || // equal to desired replicas - (modelVersion.GetDeploymentSpec() != nil && stats.replicasAvailable >= modelVersion.GetDeploymentSpec().MinReplicas && modelVersion.GetDeploymentSpec().MinReplicas > 0) || // min replicas is set and available replicas are greater than or equal to min replicas - (stats.replicasAvailable > 0 && prevModelVersion != nil && modelVersion != prevModelVersion && prevModelVersion.state.State == ModelAvailable) { - modelState = ModelAvailable + } else if modelVersion.ModelDefn.DeploymentSpec != nil && stats.replicasAvailable == 0 && + modelVersion.ModelDefn.DeploymentSpec.Replicas == 0 && modelVersion.ModelDefn.DeploymentSpec.MinReplicas == 0 { + modelState = db.ModelState_MODEL_STATE_SCALED_DOWN + } else if (modelVersion.ModelDefn.DeploymentSpec != nil && + stats.replicasAvailable == modelVersion.ModelDefn.DeploymentSpec.Replicas) || // equal to desired replicas + (modelVersion.ModelDefn.DeploymentSpec != nil && stats.replicasAvailable >= modelVersion.ModelDefn.DeploymentSpec.MinReplicas && + modelVersion.ModelDefn.DeploymentSpec.MinReplicas > 0) || // min replicas is set and available replicas are greater than or equal to min replicas + (stats.replicasAvailable > 0 && prevModelVersion != nil && modelVersion != prevModelVersion && + prevModelVersion.State.State == db.ModelState_MODEL_STATE_AVAILABLE) { + modelState = db.ModelState_MODEL_STATE_AVAILABLE } else { - modelState = ModelProgressing + modelState = db.ModelState_MODEL_STATE_PROGRESSING } } - modelVersion.state = ModelStatus{ + modelVersion.State = &db.ModelStatus{ State: modelState, - ModelGwState: modelVersion.state.ModelGwState, + ModelGwState: modelVersion.State.ModelGwState, Reason: modelReason, - ModelGwReason: modelVersion.state.ModelGwReason, - Timestamp: modelTimestamp, + ModelGwReason: modelVersion.State.ModelGwReason, + Timestamp: timestamppb.New(modelTimestamp), AvailableReplicas: stats.replicasAvailable, UnavailableReplicas: stats.replicasLoading + stats.replicasLoadFailed, DrainingReplicas: stats.replicasDraining, @@ -162,7 +170,7 @@ func (m *MemoryStore) FailedScheduling(modelID string, version uint32, reason st return fmt.Errorf("model %s found, version %d not found", modelID, version) } -func (m *MemoryStore) updateModelStatus(isLatest bool, deleted bool, modelVersion *ModelVersion, prevModelVersion *ModelVersion) { +func (m *MemoryStore) updateModelStatus(isLatest bool, deleted bool, modelVersion *db.ModelVersion, prevModelVersion *db.ModelVersion) { logger := m.logger.WithField("func", "updateModelStatus") stats := calcModelVersionStatistics(modelVersion, deleted) logger.Debugf("Stats %+v modelVersion %+v prev model %+v", stats, modelVersion, prevModelVersion) diff --git a/scheduler/pkg/store/mesh.go b/scheduler/pkg/store/mesh.go index 80edf3e399..a71a6aa6fb 100644 --- a/scheduler/pkg/store/mesh.go +++ b/scheduler/pkg/store/mesh.go @@ -83,7 +83,7 @@ func NewDefaultModelVersion(model *pb.Model, version uint32) *db.ModelVersion { return &db.ModelVersion{ Version: version, ModelDefn: model, - Replicas: make([]*db.ReplicaStatusEntry, 0), + Replicas: make(map[int32]*db.ReplicaStatus, 0), State: &db.ModelStatus{ State: db.ModelState_MODEL_STATE_UNKNOWN, ModelGwState: db.ModelState_MODEL_STATE_CREATE, @@ -154,12 +154,12 @@ func (s *Server) SetKubernetesMeta(meta *pb.KubernetesMeta) { s.kubernetesMeta = meta } -func NewServer(name string, shared bool) *Server { - return &Server{ - name: name, - replicas: make(map[int]*ServerReplica), - shared: shared, - expectedReplicas: -1, +func NewServer(name string, shared bool) *db.Server { + return &db.Server{ + Name: name, + Replicas: make(map[int32]*db.ServerReplica), + Shared: shared, + ExpectedReplicas: -1, } } @@ -212,26 +212,26 @@ func NewServerReplica(inferenceSvc string, loadedModels: loadedModels, loadingModels: map[ModelVersionID]bool{}, overCommitPercentage: overCommitPercentage, - uniqueLoadedModels: toUniqueModels(loadedModels), - isDraining: false, + //uniqueLoadedModels: toUniqueModels(loadedModels), + isDraining: false, } } -func NewServerReplicaFromConfig(server *Server, replicaIdx int, loadedModels map[ModelVersionID]bool, config *pba.ReplicaConfig, availableMemoryBytes uint64) *ServerReplica { - return &ServerReplica{ - inferenceSvc: config.GetInferenceSvc(), - inferenceHttpPort: config.GetInferenceHttpPort(), - inferenceGrpcPort: config.GetInferenceGrpcPort(), - serverName: server.name, - replicaIdx: replicaIdx, - capabilities: cleanCapabilities(config.GetCapabilities()), - memory: config.GetMemoryBytes(), - availableMemory: availableMemoryBytes, - loadedModels: loadedModels, - loadingModels: map[ModelVersionID]bool{}, - overCommitPercentage: config.GetOverCommitPercentage(), - uniqueLoadedModels: toUniqueModels(loadedModels), - isDraining: false, +func NewServerReplicaFromConfig(server *db.Server, replicaIdx int, loadedModels []*db.ModelVersionID, config *pba.ReplicaConfig, availableMemoryBytes uint64) *db.ServerReplica { + return &db.ServerReplica{ + InferenceSvc: config.GetInferenceSvc(), + InferenceHttpPort: config.GetInferenceHttpPort(), + InferenceGrpcPort: config.GetInferenceGrpcPort(), + ServerName: server.Name, + ReplicaIdx: int32(replicaIdx), + Capabilities: cleanCapabilities(config.GetCapabilities()), + Memory: config.GetMemoryBytes(), + AvailableMemory: availableMemoryBytes, + LoadedModels: loadedModels, + LoadingModels: make([]*db.ModelVersionID, 0), + OverCommitPercentage: config.GetOverCommitPercentage(), + UniqueLoadedModels: toUniqueModels(loadedModels), + IsDraining: false, } } @@ -816,9 +816,9 @@ func (s *ServerReplica) deleteModelVersion(modelName string, modelVersion uint32 } } -func toUniqueModels(loadedModels map[ModelVersionID]bool) map[string]bool { +func toUniqueModels(loadedModels []*db.ModelVersionID) map[string]bool { uniqueModels := make(map[string]bool) - for key := range loadedModels { + for _, key := range loadedModels { uniqueModels[key.Name] = true } return uniqueModels diff --git a/scheduler/pkg/store/pipeline/status.go b/scheduler/pkg/store/pipeline/status.go index 5684a09c08..784ac31b8a 100644 --- a/scheduler/pkg/store/pipeline/status.go +++ b/scheduler/pkg/store/pipeline/status.go @@ -12,6 +12,7 @@ package pipeline import ( "sync" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" @@ -73,8 +74,8 @@ func (ms *ModelStatusHandler) setPipelineModelsReady(pipelineVersion *PipelineVe } step.Available = false if model != nil { - lastAvailableModelVersion := model.GetLastAvailableModel() - if lastAvailableModelVersion != nil && lastAvailableModelVersion.ModelState().ModelGwState == store.ModelAvailable { + lastAvailableModelVersion := model.GetLastAvailableModelVersion() + if lastAvailableModelVersion != nil && lastAvailableModelVersion.State.ModelGwState == db.ModelState_MODEL_STATE_AVAILABLE { step.Available = true } } diff --git a/scheduler/pkg/store/pipeline/store.go b/scheduler/pkg/store/pipeline/store.go index 408445cb7d..cebf2557b7 100644 --- a/scheduler/pkg/store/pipeline/store.go +++ b/scheduler/pkg/store/pipeline/store.go @@ -17,6 +17,7 @@ import ( "time" "github.com/mitchellh/copystructure" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" @@ -610,8 +611,8 @@ func (ps *PipelineStore) handleModelEvents(event coordinator.ModelEventMsg) { } ps.mu.Lock() - modelVersion := model.GetLastAvailableModel() - modelAvailable := model != nil && modelVersion != nil && modelVersion.ModelState().ModelGwState == store.ModelAvailable + modelVersion := model.GetLastAvailableModelVersion() + modelAvailable := model != nil && modelVersion != nil && modelVersion.State.ModelGwState == db.ModelState_MODEL_STATE_AVAILABLE evts := updatePipelinesFromModelAvailability(refs, event.ModelName, modelAvailable, ps.pipelines, ps.logger) ps.mu.Unlock() diff --git a/scheduler/pkg/store/pipeline/utils.go b/scheduler/pkg/store/pipeline/utils.go index e23b01f93b..d39dfba5c3 100644 --- a/scheduler/pkg/store/pipeline/utils.go +++ b/scheduler/pkg/store/pipeline/utils.go @@ -15,6 +15,7 @@ import ( "strings" "github.com/rs/xid" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "google.golang.org/protobuf/types/known/timestamppb" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" @@ -311,12 +312,12 @@ func CreatePipelineVersionWithStateFromProto(pvs *scheduler.PipelineWithState) ( return pv, nil } -func CreatePipelineSnapshotFromPipeline(pipeline *Pipeline) *scheduler.PipelineSnapshot { +func CreatePipelineSnapshotFromPipeline(pipeline *Pipeline) *db.PipelineSnapshot { var versions []*scheduler.PipelineWithState for _, pv := range pipeline.Versions { versions = append(versions, CreatePipelineWithState(pv)) } - return &scheduler.PipelineSnapshot{ + return &db.PipelineSnapshot{ Name: pipeline.Name, LastVersion: pipeline.LastVersion, Versions: versions, @@ -324,7 +325,7 @@ func CreatePipelineSnapshotFromPipeline(pipeline *Pipeline) *scheduler.PipelineS } } -func CreatePipelineFromSnapshot(snapshot *scheduler.PipelineSnapshot) (*Pipeline, error) { +func CreatePipelineFromSnapshot(snapshot *db.PipelineSnapshot) (*Pipeline, error) { var versions []*PipelineVersion for _, ver := range snapshot.Versions { pv, err := CreatePipelineVersionWithStateFromProto(ver) diff --git a/scheduler/pkg/store/store.go b/scheduler/pkg/store/store.go index 07b6d843d3..cc141996b3 100644 --- a/scheduler/pkg/store/store.go +++ b/scheduler/pkg/store/store.go @@ -145,7 +145,7 @@ type ModelStore interface { UnlockModel(modelId string) RemoveModel(req *pb.UnloadModelRequest) error GetServers(shallow bool, modelDetails bool) ([]*db.Server, error) - GetServer(serverKey string, shallow bool, modelDetails bool) (*db.Server, error) + GetServer(serverKey string, shallow bool, modelDetails bool) (*db.Server, *ServerStats, error) UpdateLoadedModels(modelKey string, version uint32, serverKey string, replicas []*db.ServerReplica) error UnloadVersionModels(modelKey string, version uint32) (bool, error) UnloadModelGwVersionModels(modelKey string, version uint32) (bool, error) From 01a7d8ca190b71334b1230297ef5a21713e92b62 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Tue, 30 Dec 2025 20:41:25 +0000 Subject: [PATCH 06/50] use generics --- scheduler/pkg/store/in-memory/store.go | 129 ++++++------------------- scheduler/pkg/store/storage.go | 21 ++-- 2 files changed, 39 insertions(+), 111 deletions(-) diff --git a/scheduler/pkg/store/in-memory/store.go b/scheduler/pkg/store/in-memory/store.go index 8d9bd0c3fa..0b06428b19 100644 --- a/scheduler/pkg/store/in-memory/store.go +++ b/scheduler/pkg/store/in-memory/store.go @@ -9,147 +9,80 @@ import ( "google.golang.org/protobuf/proto" ) -type Storage struct { +type Storage[T interface { + proto.Message + GetName() string +}] struct { mu sync.RWMutex - servers map[string]*db.Server - models map[string]*db.Model + records map[string]T } -var _ store.Storage = &Storage{} +var _ store.Storage[*db.Model] = &Storage[*db.Model]{} +var _ store.Storage[*db.Server] = &Storage[*db.Server]{} -func NewStorage() *Storage { - return &Storage{ - servers: make(map[string]*db.Server), - models: make(map[string]*db.Model), - } -} - -// GetModel retrieves a model by name, returning a clone to prevent external modification -func (s *Storage) GetModel(name string) (*db.Model, error) { - s.mu.RLock() - defer s.mu.RUnlock() - - model, ok := s.models[name] - if !ok { - return nil, store.ErrNotFound - } - - return proto.Clone(model).(*db.Model), nil -} - -// AddModel adds a new model, storing a clone to prevent external modification -func (s *Storage) AddModel(model *db.Model) error { - s.mu.Lock() - defer s.mu.Unlock() - - if _, ok := s.models[model.Name]; ok { - return fmt.Errorf("model with name %s already exists", model.Name) - } - - s.models[model.Name] = proto.Clone(model).(*db.Model) - return nil -} - -// ListModels returns all models, cloning each to prevent external modification -func (s *Storage) ListModels() ([]*db.Model, error) { - s.mu.RLock() - defer s.mu.RUnlock() - - models := make([]*db.Model, 0, len(s.models)) - for _, model := range s.models { - models = append(models, proto.Clone(model).(*db.Model)) - } - - return models, nil -} - -// UpdateModel updates an existing model, storing a clone to prevent external modification -func (s *Storage) UpdateModel(model *db.Model) error { - s.mu.Lock() - defer s.mu.Unlock() - - if _, ok := s.models[model.Name]; !ok { - return store.ErrNotFound - } - - s.models[model.Name] = proto.Clone(model).(*db.Model) - return nil -} - -// DeleteModel removes a model by name -func (s *Storage) DeleteModel(name string) error { - s.mu.Lock() - defer s.mu.Unlock() - - if _, ok := s.models[name]; !ok { - return store.ErrNotFound - } - - delete(s.models, name) - return nil +func NewStorage[T interface { + proto.Message + GetName() string +}]() *Storage[T] { + return &Storage[T]{} } -// GetServer retrieves a server by name, returning a clone to prevent external modification -func (s *Storage) GetServer(name string) (*db.Server, error) { +func (s *Storage[T]) Get(id string) (T, error) { s.mu.RLock() defer s.mu.RUnlock() - server, ok := s.servers[name] + record, ok := s.records[id] if !ok { - return nil, store.ErrNotFound + return *new(T), store.ErrNotFound } - return proto.Clone(server).(*db.Server), nil + return proto.Clone(record).(T), nil } -// AddServer adds a new server, storing a clone to prevent external modification -func (s *Storage) AddServer(server *db.Server) error { +func (s *Storage[T]) Insert(record T) error { s.mu.Lock() defer s.mu.Unlock() - if _, ok := s.servers[server.Name]; ok { - return fmt.Errorf("server with name %s already exists", server.Name) + if _, ok := s.records[record.GetName()]; ok { + return fmt.Errorf("record with name %s already exists", record.GetName()) } - s.servers[server.Name] = proto.Clone(server).(*db.Server) + s.records[record.GetName()] = proto.Clone(record).(T) return nil } -// ListServers returns all servers, cloning each to prevent external modification -func (s *Storage) ListServers() ([]*db.Server, error) { +func (s *Storage[T]) List() ([]T, error) { s.mu.RLock() defer s.mu.RUnlock() - servers := make([]*db.Server, 0, len(s.servers)) - for _, server := range s.servers { - servers = append(servers, proto.Clone(server).(*db.Server)) + records := make([]T, 0, len(s.records)) + for _, record := range s.records { + records = append(records, proto.Clone(record).(T)) } - return servers, nil + return records, nil } -// UpdateServer updates an existing server, storing a clone to prevent external modification -func (s *Storage) UpdateServer(server *db.Server) error { +func (s *Storage[T]) Update(record T) error { s.mu.Lock() defer s.mu.Unlock() - if _, ok := s.servers[server.Name]; !ok { + if _, ok := s.records[record.GetName()]; !ok { return store.ErrNotFound } - s.servers[server.Name] = proto.Clone(server).(*db.Server) + s.records[record.GetName()] = proto.Clone(record).(T) return nil } -// DeleteServer removes a server by name -func (s *Storage) DeleteServer(serverName string) error { +func (s *Storage[T]) Delete(id string) error { s.mu.Lock() defer s.mu.Unlock() - if _, ok := s.servers[serverName]; !ok { + if _, ok := s.records[id]; !ok { return store.ErrNotFound } - delete(s.servers, serverName) + delete(s.records, id) return nil } diff --git a/scheduler/pkg/store/storage.go b/scheduler/pkg/store/storage.go index 95ccd197e4..ba1fb3d522 100644 --- a/scheduler/pkg/store/storage.go +++ b/scheduler/pkg/store/storage.go @@ -11,22 +11,17 @@ package store import ( "errors" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "google.golang.org/protobuf/proto" ) var ( - ErrNotFound = errors.New("not found") + ErrNotFound = errors.New("record not found") ) -type Storage interface { - GetModel(name string) (*db.Model, error) - AddModel(model *db.Model) error - ListModels() ([]*db.Model, error) - UpdateModel(model *db.Model) error - DeleteModel(name string) error - GetServer(name string) (*db.Server, error) - AddServer(server *db.Server) error - ListServers() ([]*db.Server, error) - UpdateServer(server *db.Server) error - DeleteServer(serverName string) error +type Storage[T proto.Message] interface { + Get(name string) (T, error) + Insert(record T) error + List() ([]T, error) + Update(record T) error + Delete(name string) error } From 0953f0323d73efeda262ef9f209674a8f9acf77f Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Fri, 2 Jan 2026 14:56:40 +0000 Subject: [PATCH 07/50] update to use new proto definitions for models/servers --- apis/go/mlops/scheduler/db/model_ext.go | 120 +++++++++++++ apis/go/mlops/scheduler/db/server_ext.go | 4 + scheduler/cmd/scheduler/main.go | 3 +- scheduler/pkg/agent/server.go | 69 +++---- scheduler/pkg/agent/server_test.go | 2 +- scheduler/pkg/envoy/processor/incremental.go | 49 ++--- scheduler/pkg/envoy/xdscache/seldoncache.go | 6 +- .../conflict_resolution.go | 36 ++-- .../scheduler/cleaner/test_versions_hack.go | 2 +- scheduler/pkg/scheduler/cleaner/versions.go | 15 +- .../pkg/scheduler/filters/deletedserver.go | 6 +- scheduler/pkg/scheduler/filters/explainer.go | 10 +- scheduler/pkg/scheduler/filters/interface.go | 12 +- .../pkg/scheduler/filters/replicadraining.go | 6 +- .../pkg/scheduler/filters/replicamemory.go | 10 +- .../pkg/scheduler/filters/serverreplicas.go | 6 +- .../scheduler/filters/serverrequirements.go | 8 +- scheduler/pkg/scheduler/filters/sharing.go | 6 +- scheduler/pkg/scheduler/scheduler.go | 67 +++---- scheduler/pkg/scheduler/scheduler_test.go | 2 +- scheduler/pkg/scheduler/sorters/interface.go | 16 +- scheduler/pkg/scheduler/sorters/loaded.go | 6 +- scheduler/pkg/server/server.go | 45 +++-- scheduler/pkg/server/server_status.go | 150 +++++++++------- scheduler/pkg/server/utils.go | 12 +- scheduler/pkg/store/{store.go => api.go} | 5 +- scheduler/pkg/store/experiment/store.go | 4 +- scheduler/pkg/store/experiment/store_test.go | 2 +- scheduler/pkg/store/memory.go | 87 ++++----- scheduler/pkg/store/memory_status.go | 8 +- scheduler/pkg/store/mock/store.go | 169 +++++++++--------- scheduler/pkg/store/pipeline/db.go | 6 +- scheduler/pkg/store/pipeline/status.go | 2 +- scheduler/pkg/store/pipeline/status_test.go | 2 +- scheduler/pkg/store/pipeline/store.go | 2 +- scheduler/pkg/store/test_memory_hack.go | 19 +- 36 files changed, 564 insertions(+), 410 deletions(-) rename scheduler/pkg/store/{store.go => api.go} (95%) diff --git a/apis/go/mlops/scheduler/db/model_ext.go b/apis/go/mlops/scheduler/db/model_ext.go index 7baae1b47d..5f05b89b10 100644 --- a/apis/go/mlops/scheduler/db/model_ext.go +++ b/apis/go/mlops/scheduler/db/model_ext.go @@ -30,10 +30,51 @@ func (m *ModelVersion) GetAssignment() []int { return nil } +func (m *ModelVersion) ReplicaState() map[int]*ReplicaStatus { + copyReplicas := make(map[int]*ReplicaStatus, len(m.Replicas)) + for idx, r := range m.Replicas { + copyReplicas[int(idx)] = r + } + return copyReplicas +} + +func (m *ModelVersion) GetRequestedServer() *string { + return m.ModelDefn.GetModelSpec().Server +} + +func (m *ModelVersion) GetRequirements() []string { + return m.ModelDefn.GetModelSpec().GetRequirements() +} + +func (m *ModelVersion) IsLoadingOrLoaded(server string, replicaIdx int) bool { + if server != m.Server { + return false + } + for r, v := range m.Replicas { + if int(r) == replicaIdx && v.State.IsLoadingOrLoaded() { + return true + } + } + return false +} + func (m *ModelVersion) DesiredReplicas() int { return int(m.ModelDefn.DeploymentSpec.Replicas) } +func (m *ModelVersion) ModelName() string { + return m.ModelDefn.GetMeta().GetName() +} + +func (m *ModelVersion) IsLoadingOrLoadedOnServer() bool { + for _, v := range m.Replicas { + if v.State.AlreadyLoadingOrLoaded() { + return true + } + } + return false +} + func (m *ModelVersion) GetReplicaForState(state ModelReplicaState) []int { var assignment []int for k, v := range m.Replicas { @@ -44,6 +85,85 @@ func (m *ModelVersion) GetReplicaForState(state ModelReplicaState) []int { return assignment } +func (m *Model) GetLastAvailableModel() *ModelVersion { + if m == nil { // TODO Make safe by not working on actual object + return nil + } + lastAvailableIdx := m.getLastAvailableModelIdx() + if lastAvailableIdx != -1 { + return m.Versions[lastAvailableIdx] + } + return nil +} + +func (m *Model) CanReceiveTraffic() bool { + if m.GetLastAvailableModel() != nil { + return true + } + latestVersion := m.Latest() + if latestVersion != nil && latestVersion.HasLiveReplicas() { + return true + } + return false +} + +func (m *Model) getLastModelGwAvailableModelIdx() int { + if m == nil { // TODO Make safe by not working on actual object + return -1 + } + lastAvailableIdx := -1 + for idx, mv := range m.Versions { + if mv.State.ModelGwState == ModelState_MODEL_STATE_AVAILABLE { + lastAvailableIdx = idx + } + } + return lastAvailableIdx +} + +func (m *Model) GetVersionsBeforeLastModelGwAvailable() []*ModelVersion { + if m == nil { // TODO Make safe by not working on actual object + return nil + } + lastAvailableIdx := m.getLastModelGwAvailableModelIdx() + if lastAvailableIdx != -1 { + return m.Versions[0:lastAvailableIdx] + } + return nil +} + +func (m *Model) getLastAvailableModelIdx() int { + if m == nil { // TODO Make safe by not working on actual object + return -1 + } + lastAvailableIdx := -1 + for idx, mv := range m.Versions { + if mv.State.State == ModelState_MODEL_STATE_AVAILABLE { + lastAvailableIdx = idx + } + } + return lastAvailableIdx +} + +func (m *Model) GetVersionsBeforeLastAvailable() []*ModelVersion { + if m == nil { + return nil + } + lastAvailableIdx := m.getLastAvailableModelIdx() + if lastAvailableIdx != -1 { + return m.Versions[0:lastAvailableIdx] + } + return nil +} + +func (m *ModelVersion) HasLiveReplicas() bool { + for _, v := range m.Replicas { + if v.State.CanReceiveTraffic() { + return true + } + } + return false +} + func (m *ModelVersion) HasServer() bool { return m.Server != "" } diff --git a/apis/go/mlops/scheduler/db/server_ext.go b/apis/go/mlops/scheduler/db/server_ext.go index 58910f96f0..0e9854a940 100644 --- a/apis/go/mlops/scheduler/db/server_ext.go +++ b/apis/go/mlops/scheduler/db/server_ext.go @@ -23,6 +23,10 @@ func (s *ServerReplica) UpdateReservedMemory(memBytes uint64, isAdd bool) { s.ReservedMemory -= memBytes } +func (s *ServerReplica) GetNumLoadedModels() int { + return len(s.UniqueLoadedModels) +} + func (s *ServerReplica) AddModelVersion(modelName string, modelVersion uint32, replicaState ModelReplicaState) { mvID := &ModelVersionID{ Name: modelName, diff --git a/scheduler/cmd/scheduler/main.go b/scheduler/cmd/scheduler/main.go index fd28d90904..3ebdfb3ea9 100644 --- a/scheduler/cmd/scheduler/main.go +++ b/scheduler/cmd/scheduler/main.go @@ -23,6 +23,7 @@ import ( "syscall" "time" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" in_memory "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/in-memory" log "github.com/sirupsen/logrus" "google.golang.org/grpc" @@ -299,7 +300,7 @@ func main() { } // Create stores - ss := store.NewMemoryStore(logger, in_memory.NewStorage(), eventHub) + ss := store.NewMemoryStore(logger, in_memory.NewStorage[*db.Model](), in_memory.NewStorage[*db.Server](), eventHub) ps := pipeline.NewPipelineStore(logger, eventHub, ss) es := experiment.NewExperimentServer(logger, eventHub, ss, ps) cleaner := cleaner.NewVersionCleaner(ss, logger) diff --git a/scheduler/pkg/agent/server.go b/scheduler/pkg/agent/server.go index 31338d6b2d..2d6c76c0e2 100644 --- a/scheduler/pkg/agent/server.go +++ b/scheduler/pkg/agent/server.go @@ -112,7 +112,7 @@ type Server struct { health.UnimplementedHealthCheckServiceServer logger log.FieldLogger agents map[ServerKey]*AgentSubscriber - store store.ModelStore + store store.ModelServerAPI scheduler scheduler.Scheduler waiter *modelRelocatedWaiter // waiter for when we want to drain a particular server replica autoscalingModelEnabled bool @@ -152,7 +152,7 @@ func (a *AgentSubscriber) Send(message *pb.ModelOperationMessage) error { func NewAgentServer( logger log.FieldLogger, - store store.ModelStore, + store store.ModelServerAPI, scheduler scheduler.Scheduler, hub *coordinator.EventHub, autoscalingModelEnabled bool, @@ -281,17 +281,17 @@ func (s *Server) Sync(modelName string) { // Handle any load requests for latest version - we don't want to load models from older versions if latestModel != nil { - for _, replicaIdx := range latestModel.GetReplicaForState(store.LoadRequested) { + for _, replicaIdx := range latestModel.GetReplicaForState(db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED) { logger.Infof("Sending load model request for %s", modelName) - as, ok := s.agents[ServerKey{serverName: latestModel.Server(), replicaIdx: uint32(replicaIdx)}] + as, ok := s.agents[ServerKey{serverName: latestModel.Server, replicaIdx: uint32(replicaIdx)}] if !ok { - logger.Errorf("Failed to find server replica for %s:%d", latestModel.Server(), replicaIdx) + logger.Errorf("Failed to find server replica for %s:%d", latestModel.Server, replicaIdx) continue } - model := latestModel.GetModel() + model := latestModel.ModelDefn err = as.Send(&pb.ModelOperationMessage{ Operation: pb.ModelOperationMessage_LOAD_MODEL, ModelVersion: &pb.ModelVersion{Model: model, Version: latestModel.GetVersion()}, @@ -301,13 +301,15 @@ func (s *Server) Sync(modelName string) { if err != nil { logger.WithError(err).Errorf("stream message send failed for model %s and replicaidx %d", modelName, replicaIdx) if errState := s.store.UpdateModelState( - latestModel.Key(), latestModel.GetVersion(), latestModel.Server(), replicaIdx, nil, - store.LoadRequested, store.LoadFailed, err.Error(), nil); errState != nil { + latestModel.ModelName(), latestModel.GetVersion(), latestModel.Server, replicaIdx, nil, + db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED, err.Error(), nil); errState != nil { logger.WithError(errState).Errorf("Sync set model state failed for model %s replicaidx %d", modelName, replicaIdx) } continue } - err = s.store.UpdateModelState(latestModel.Key(), latestModel.GetVersion(), latestModel.Server(), replicaIdx, nil, store.LoadRequested, store.Loading, "", nil) + err = s.store.UpdateModelState(latestModel.ModelName(), latestModel.GetVersion(), + latestModel.Server, replicaIdx, nil, db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, + db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING, "", nil) if err != nil { logger.WithError(err).Errorf("Sync set model state failed for model %s replicaidx %d", modelName, replicaIdx) continue @@ -317,29 +319,30 @@ func (s *Server) Sync(modelName string) { // Loop through all versions and unload any requested - any version of a model might have an unload request for _, modelVersion := range model.Versions { - for _, replicaIdx := range modelVersion.GetReplicaForState(store.UnloadRequested) { + for _, replicaIdx := range modelVersion.GetReplicaForState(db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED) { s.logger.Infof("Sending unload model request for %s:%d", modelName, modelVersion.GetVersion()) - as, ok := s.agents[ServerKey{serverName: modelVersion.Server(), replicaIdx: uint32(replicaIdx)}] + as, ok := s.agents[ServerKey{serverName: modelVersion.Server, replicaIdx: uint32(replicaIdx)}] if !ok { - logger.Errorf("Failed to find server replica for %s:%d", modelVersion.Server(), replicaIdx) + logger.Errorf("Failed to find server replica for %s:%d", modelVersion.Server, replicaIdx) continue } err = as.Send(&pb.ModelOperationMessage{ Operation: pb.ModelOperationMessage_UNLOAD_MODEL, - ModelVersion: &pb.ModelVersion{Model: modelVersion.GetModel(), Version: modelVersion.GetVersion()}, + ModelVersion: &pb.ModelVersion{Model: modelVersion.ModelDefn, Version: modelVersion.GetVersion()}, }) if err != nil { logger.WithError(err).Errorf("stream message send failed for model %s and replicaidx %d", modelName, replicaIdx) if errState := s.store.UpdateModelState( - latestModel.Key(), latestModel.GetVersion(), latestModel.Server(), replicaIdx, nil, - store.UnloadRequested, store.UnloadFailed, err.Error(), nil); errState != nil { + latestModel.ModelName(), latestModel.GetVersion(), latestModel.Server, replicaIdx, nil, + db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED, db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_FAILED, err.Error(), nil); errState != nil { logger.WithError(errState).Errorf("Sync set model state failed for model %s replicaidx %d", modelName, replicaIdx) } continue } - err = s.store.UpdateModelState(modelVersion.Key(), modelVersion.GetVersion(), modelVersion.Server(), replicaIdx, nil, store.UnloadRequested, store.Unloading, "", nil) + err = s.store.UpdateModelState(modelVersion.ModelName(), modelVersion.GetVersion(), modelVersion.Server, + replicaIdx, nil, db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED, db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADING, "", nil) if err != nil { logger.WithError(err).Errorf("Sync set model state failed for model %s replicaidx %d", modelName, replicaIdx) continue @@ -357,24 +360,24 @@ func (s *Server) AgentDrain(ctx context.Context, message *pb.AgentDrainRequest) func (s *Server) AgentEvent(ctx context.Context, message *pb.ModelEventMessage) (*pb.ModelEventResponse, error) { logger := s.logger.WithField("func", "AgentEvent") - var desiredState store.ModelReplicaState - var expectedState store.ModelReplicaState + var desiredState db.ModelReplicaState + var expectedState db.ModelReplicaState switch message.Event { case pb.ModelEventMessage_LOADED: - expectedState = store.Loading - desiredState = store.Loaded + expectedState = db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING + desiredState = db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED case pb.ModelEventMessage_UNLOADED: - expectedState = store.Unloading - desiredState = store.Unloaded + expectedState = db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADING + desiredState = db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED case pb.ModelEventMessage_LOAD_FAILED, pb.ModelEventMessage_LOAD_FAIL_MEMORY: - expectedState = store.Loading - desiredState = store.LoadFailed + expectedState = db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING + desiredState = db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED case pb.ModelEventMessage_UNLOAD_FAILED: - expectedState = store.Unloading - desiredState = store.UnloadFailed + expectedState = db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADING + desiredState = db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_FAILED default: - desiredState = store.ModelReplicaStateUnknown + desiredState = db.ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN } logger.Infof("Updating state for model %s to %s", message.ModelName, desiredState.String()) s.store.LockModel(message.ModelName) @@ -581,10 +584,10 @@ func (s *Server) updateAndSchedule(modelProtos *pbs.Model) error { return s.scheduler.Schedule(modelName) } -func createScalingPseudoRequest(message *pb.ModelScalingTriggerMessage, model *store.ModelSnapshot) (*pbs.Model, error) { +func createScalingPseudoRequest(message *pb.ModelScalingTriggerMessage, model *db.Model) (*pbs.Model, error) { modelName := message.ModelName - lastModelVersion := model.GetLatest() + lastModelVersion := model.Latest() if lastModelVersion == nil { return nil, fmt.Errorf("Model %s does not exist yet, possibly due to scheduler restarting", modelName) } @@ -602,14 +605,14 @@ func createScalingPseudoRequest(message *pb.ModelScalingTriggerMessage, model *s modelName, lastModelVersion.GetVersion(), message.GetModelVersion()) } - modelProtos := lastModelVersion.GetModel() // this is a clone of the protos + modelProtos := lastModelVersion.ModelDefn // this is a clone of the protos // if we are scaling up: // the model should be available // if we are scaling down: // we reduce the replicas by one and try our best // if we have a draining replica while scaling down, this should be still fine I think? - numReplicas := int(lastModelVersion.GetDeploymentSpec().Replicas) + numReplicas := int(lastModelVersion.ModelDefn.DeploymentSpec.Replicas) if tryScaleDown { if !isModelStable(lastModelVersion) { @@ -625,8 +628,8 @@ func createScalingPseudoRequest(message *pb.ModelScalingTriggerMessage, model *s return modelProtos, nil } -func isModelStable(modelVersion *store.ModelVersion) bool { - return modelVersion.ModelState().Timestamp.Before(time.Now().Add(-modelScalingCoolingDownSeconds * time.Second)) +func isModelStable(modelVersion *db.ModelVersion) bool { + return modelVersion.State.Timestamp.AsTime().Before(time.Now().Add(-modelScalingCoolingDownSeconds * time.Second)) } func calculateDesiredNumReplicas(model *pbs.Model, trigger pb.ModelScalingTriggerMessage_Trigger, numReplicas int) (int, error) { diff --git a/scheduler/pkg/agent/server_test.go b/scheduler/pkg/agent/server_test.go index 3ba303f2a2..7336338f03 100644 --- a/scheduler/pkg/agent/server_test.go +++ b/scheduler/pkg/agent/server_test.go @@ -48,7 +48,7 @@ type mockStore struct { models map[string]*store.ModelSnapshot } -var _ store.ModelStore = (*mockStore)(nil) +var _ store.ModelServerAPI = (*mockStore)(nil) func (m *mockStore) FailedScheduling(modelID string, version uint32, reason string, reset bool) error { return nil diff --git a/scheduler/pkg/envoy/processor/incremental.go b/scheduler/pkg/envoy/processor/incremental.go index b8f5848017..58babf71f4 100644 --- a/scheduler/pkg/envoy/processor/incremental.go +++ b/scheduler/pkg/envoy/processor/incremental.go @@ -15,6 +15,7 @@ import ( "sync" "time" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" @@ -38,7 +39,7 @@ type IncrementalProcessor struct { logger logrus.FieldLogger xdsCache *xdscache.SeldonXDSCache mu sync.RWMutex - modelStore store.ModelStore + modelStore store.ModelServerAPI experimentServer experiment.ExperimentServer pipelineHandler pipeline.PipelineHandler runEnvoyBatchUpdates bool @@ -57,7 +58,7 @@ type pendingModelVersion struct { func NewIncrementalProcessor( nodeID string, logger logrus.FieldLogger, - modelStore store.ModelStore, + modelStore store.ModelServerAPI, experimentServer experiment.ExperimentServer, pipelineHandler pipeline.PipelineHandler, hub *coordinator.EventHub, @@ -205,21 +206,21 @@ func (p *IncrementalProcessor) removeRouteForServerInEnvoyCache(routeName string return nil } -func (p *IncrementalProcessor) updateEnvoyForModelVersion(routeName string, modelVersion *store.ModelVersion, server *store.ServerSnapshot, trafficPercent uint32, isMirror bool) { +func (p *IncrementalProcessor) updateEnvoyForModelVersion(routeName string, modelVersion *db.ModelVersion, server *db.Server, trafficPercent uint32, isMirror bool) { logger := p.logger.WithField("func", "updateEnvoyForModelVersion") assignment := modelVersion.GetAssignment() if len(assignment) == 0 { logger.Debugf("Not updating route: %s - no assigned replicas for %v", routeName, modelVersion) return } - modelName := modelVersion.GetMeta().GetName() + modelName := modelVersion.ModelDefn.Meta.Name modelVersionNumber := modelVersion.GetVersion() httpClusterName, grpcClusterName := getClusterNames(modelName, modelVersionNumber) p.xdsCache.AddClustersForRoute(routeName, modelName, httpClusterName, grpcClusterName, modelVersionNumber, assignment, server) logPayloads := false - if modelVersion.GetDeploymentSpec() != nil { - logPayloads = modelVersion.GetDeploymentSpec().LogPayloads + if modelVersion.ModelDefn.DeploymentSpec != nil { + logPayloads = modelVersion.ModelDefn.DeploymentSpec.LogPayloads } else { logger.Warnf("model %s has not deployment spec", modelName) } @@ -233,7 +234,7 @@ func getClusterNames(modelVersion string, modelVersionNumber uint32) (string, st return httpClusterName, grpcClusterName } -func getTrafficShare(latestModel *store.ModelVersion, lastAvailableModelVersion *store.ModelVersion, weight uint32) (uint32, uint32) { +func getTrafficShare(latestModel *db.ModelVersion, lastAvailableModelVersion *db.ModelVersion, weight uint32) (uint32, uint32) { lastAvailableReplicas := len(lastAvailableModelVersion.GetAssignment()) latestReplicas := len(latestModel.GetAssignment()) totalReplicas := lastAvailableReplicas + latestReplicas @@ -242,11 +243,11 @@ func getTrafficShare(latestModel *store.ModelVersion, lastAvailableModelVersion return trafficLatestModel, trafficLastAvailableModel } -func (p *IncrementalProcessor) addModelTraffic(routeName string, model *store.ModelSnapshot, weight uint32, isMirror bool) error { +func (p *IncrementalProcessor) addModelTraffic(routeName string, model *db.Model, weight uint32, isMirror bool) error { logger := p.logger.WithField("func", "addModelTraffic") modelName := model.Name - latestModel := model.GetLatest() + latestModel := model.Latest() if latestModel == nil || !model.CanReceiveTraffic() { if latestModel == nil { logger.Infof("latest model is nil for model %s route %s", model.Name, routeName) @@ -254,7 +255,7 @@ func (p *IncrementalProcessor) addModelTraffic(routeName string, model *store.Mo return fmt.Errorf("no live replica for model %s for model route %s", model.Name, routeName) } - server, err := p.modelStore.GetServer(latestModel.Server(), false, false) + server, _, err := p.modelStore.GetServer(latestModel.Server, false, false) if err != nil { return err } @@ -262,9 +263,9 @@ func (p *IncrementalProcessor) addModelTraffic(routeName string, model *store.Mo lastAvailableModelVersion := model.GetLastAvailableModel() if lastAvailableModelVersion != nil && latestModel.GetVersion() != lastAvailableModelVersion.GetVersion() { trafficLatestModel, trafficLastAvailableModel := getTrafficShare(latestModel, lastAvailableModelVersion, weight) - lastAvailableServer, err := p.modelStore.GetServer(lastAvailableModelVersion.Server(), false, false) + lastAvailableServer, _, err := p.modelStore.GetServer(lastAvailableModelVersion.Server, false, false) if err != nil { - logger.WithError(err).Errorf("Failed to find server %s for last available model %s", lastAvailableModelVersion.Server(), modelName) + logger.WithError(err).Errorf("Failed to find server %s for last available model %s", lastAvailableModelVersion.Server, modelName) return err } @@ -284,7 +285,7 @@ func (p *IncrementalProcessor) addModelTraffic(routeName string, model *store.Mo return nil } -func (p *IncrementalProcessor) addExperimentModelBaselineTraffic(model *store.ModelSnapshot, exp *experiment.Experiment) error { +func (p *IncrementalProcessor) addExperimentModelBaselineTraffic(model *db.Model, exp *experiment.Experiment) error { logger := p.logger.WithField("func", "addExperimentModelBaselineTraffic") logger.Infof("Trying to setup experiment for %s", model.Name) if exp.Default == nil { @@ -320,7 +321,7 @@ func (p *IncrementalProcessor) addExperimentModelBaselineTraffic(model *store.Mo return nil } -func (p *IncrementalProcessor) addModel(model *store.ModelSnapshot) error { +func (p *IncrementalProcessor) addModel(model *db.Model) error { logger := p.logger.WithField("func", "addTraffic") exp := p.experimentServer.GetExperimentForBaselineModel(model.Name) if exp != nil { @@ -526,7 +527,7 @@ func (p *IncrementalProcessor) modelUpdate(modelName string) error { return p.updateEnvoy() // in practice we should not be here } - latestModel := model.GetLatest() + latestModel := model.Latest() if latestModel == nil { logger.Debugf("sync: No latest model - removing for %s", modelName) if err := p.removeRouteForServerInEnvoyCache(modelName); err != nil { @@ -551,7 +552,7 @@ func (p *IncrementalProcessor) modelUpdate(modelName string) error { } if !modelRemoved { - _, err = p.modelStore.GetServer(latestModel.Server(), false, false) + _, _, err = p.modelStore.GetServer(latestModel.Server, false, false) if err != nil { logger.Debugf("sync: No server - removing for %s", modelName) if err := p.removeRouteForServerInEnvoyCache(modelName); err != nil { @@ -645,10 +646,10 @@ func (p *IncrementalProcessor) modelSync() { logger.Debugf("Calling model sync") envoyErr := p.updateEnvoy() - serverReplicaState := store.Available + serverReplicaState := db.ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE reason := "" if envoyErr != nil { - serverReplicaState = store.LoadedUnavailable + serverReplicaState = db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE reason = envoyErr.Error() } @@ -669,9 +670,9 @@ func (p *IncrementalProcessor) modelSync() { continue } - s, err := p.modelStore.GetServer(v.Server(), false, false) + s, _, err := p.modelStore.GetServer(v.Server, false, false) if err != nil { - logger.Debugf("Failed to get server for model %s server %s", mv.name, v.Server()) + logger.Debugf("Failed to get server for model %s server %s", mv.name, v.Server) p.modelStore.UnlockModel(mv.name) continue } @@ -681,7 +682,7 @@ func (p *IncrementalProcessor) modelSync() { for _, replicaIdx := range v.GetAssignment() { serverReplicaExpectedState := vs[replicaIdx].State // Ignore draining nodes to be changed to Available/Failed state - if serverReplicaExpectedState != store.Draining { + if serverReplicaExpectedState != db.ModelReplicaState_MODEL_REPLICA_STATE_DRAINING { err2 := p.modelStore.UpdateModelState( mv.name, v.GetVersion(), @@ -700,12 +701,12 @@ func (p *IncrementalProcessor) modelSync() { } else { logger.Debugf( "Skipping replica for model %s in state %s server replica %s%d as no longer in Loaded state", - mv.name, serverReplicaExpectedState.String(), v.Server(), replicaIdx) + mv.name, serverReplicaExpectedState.String(), v.Server, replicaIdx) } } // Go through all replicas that we have set to UnloadEnvoyRequested and mark them as UnloadRequested // to resume the unload process from servers - for _, replicaIdx := range v.GetReplicaForState(store.UnloadEnvoyRequested) { + for _, replicaIdx := range v.GetReplicaForState(db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED) { serverReplicaExpectedState := vs[replicaIdx].State if err := p.modelStore.UpdateModelState( mv.name, @@ -714,7 +715,7 @@ func (p *IncrementalProcessor) modelSync() { replicaIdx, nil, serverReplicaExpectedState, - store.UnloadRequested, + db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED, "", nil, ); err != nil { diff --git a/scheduler/pkg/envoy/xdscache/seldoncache.go b/scheduler/pkg/envoy/xdscache/seldoncache.go index ff69fbbcbe..88d2ab91d2 100644 --- a/scheduler/pkg/envoy/xdscache/seldoncache.go +++ b/scheduler/pkg/envoy/xdscache/seldoncache.go @@ -21,12 +21,12 @@ import ( "github.com/envoyproxy/go-control-plane/pkg/cache/v3" "github.com/envoyproxy/go-control-plane/pkg/resource/v3" "github.com/envoyproxy/go-control-plane/pkg/server/stream/v3" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" seldontls "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/pipeline" "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) @@ -473,7 +473,7 @@ func (xds *SeldonXDSCache) AddClustersForRoute( routeName, modelName, httpClusterName, grpcClusterName string, modelVersion uint32, assignment []int, - server *store.ServerSnapshot, + server *db.Server, ) { xds.mu.Lock() defer xds.mu.Unlock() @@ -503,7 +503,7 @@ func (xds *SeldonXDSCache) AddClustersForRoute( } for _, replicaIdx := range assignment { - replica, ok := server.Replicas[replicaIdx] + replica, ok := server.Replicas[int32(replicaIdx)] if !ok { logger.Warnf("Invalid replica index %d for server %s", replicaIdx, server.Name) } else { diff --git a/scheduler/pkg/kafka/conflict-resolution/conflict_resolution.go b/scheduler/pkg/kafka/conflict-resolution/conflict_resolution.go index 23bca16b21..3e010e71b5 100644 --- a/scheduler/pkg/kafka/conflict-resolution/conflict_resolution.go +++ b/scheduler/pkg/kafka/conflict-resolution/conflict_resolution.go @@ -12,12 +12,12 @@ package conflict_resolution import ( "fmt" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/apis/go/v2/mlops/chainer" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/pipeline" ) @@ -212,43 +212,43 @@ func IsPipelineMessageOutdated( // -------------------- func CreateNewModelIteration( - cr *ConflictResolutioner[store.ModelState], + cr *ConflictResolutioner[db.ModelState], modelName string, servers []string, ) { - cr.CreateNewIteration(modelName, servers, store.ModelStateUnknown) + cr.CreateNewIteration(modelName, servers, db.ModelState_MODEL_STATE_UNKNOWN) } func GetModelStatus( - cr *ConflictResolutioner[store.ModelState], + cr *ConflictResolutioner[db.ModelState], modelName string, message *pb.ModelUpdateStatusMessage, -) (store.ModelState, string) { +) (db.ModelState, string) { logger := cr.logger.WithField("func", "GetModelStatus") streams := cr.VectorResponseStatus[modelName] var messageStr = "" - readyCount := cr.GetCountWithStatus(modelName, store.ModelAvailable) + readyCount := cr.GetCountWithStatus(modelName, db.ModelState_MODEL_STATE_AVAILABLE) if readyCount > 0 { messageStr += fmt.Sprintf("%d/%d ready ", readyCount, len(streams)) } - terminatedCount := cr.GetCountWithStatus(modelName, store.ModelTerminated) + terminatedCount := cr.GetCountWithStatus(modelName, db.ModelState_MODEL_STATE_TERMINATED) if terminatedCount > 0 { messageStr += fmt.Sprintf("%d/%d terminated ", terminatedCount, len(streams)) } - failedCount := cr.GetCountWithStatus(modelName, store.ModelFailed) + failedCount := cr.GetCountWithStatus(modelName, db.ModelState_MODEL_STATE_FAILED) if failedCount > 0 { messageStr += fmt.Sprintf("%d/%d failed ", failedCount, len(streams)) } - terminatedFailedCount := cr.GetCountWithStatus(modelName, store.ModelTerminateFailed) + terminatedFailedCount := cr.GetCountWithStatus(modelName, db.ModelState_MODEL_STATE_TERMINATE_FAILED) if terminatedFailedCount > 0 { messageStr += fmt.Sprintf("%d/%d terminate failed ", terminatedFailedCount, len(streams)) } - unknownCount := cr.GetCountWithStatus(modelName, store.ModelStateUnknown) + unknownCount := cr.GetCountWithStatus(modelName, db.ModelState_MODEL_STATE_UNKNOWN) logger.Infof("Model %s status counts: %s", modelName, messageStr) if message.Update.Op == pb.ModelUpdateMessage_Create { @@ -258,29 +258,29 @@ func GetModelStatus( // TODO: Implement something similar to models to display the numbers // of available replicas if failedCount == len(streams) { - return store.ModelFailed, messageStr + return db.ModelState_MODEL_STATE_FAILED, messageStr } if readyCount > 0 && unknownCount == 0 { - return store.ModelAvailable, messageStr + return db.ModelState_MODEL_STATE_AVAILABLE, messageStr } - return store.ModelProgressing, messageStr + return db.ModelState_MODEL_STATE_PROGRESSING, messageStr } if message.Update.Op == pb.ModelUpdateMessage_Delete { if failedCount > 0 { - return store.ModelTerminateFailed, messageStr + return db.ModelState_MODEL_STATE_TERMINATE_FAILED, messageStr } if terminatedCount == len(streams) { - return store.ModelTerminated, messageStr + return db.ModelState_MODEL_STATE_TERMINATED, messageStr } - return store.ModelTerminating, messageStr + return db.ModelState_MODEL_STATE_TERMINATING, messageStr } - return store.ModelStateUnknown, "Unknown operation or status" + return db.ModelState_MODEL_STATE_UNKNOWN, "Unknown operation or status" } func IsModelMessageOutdated( - cr *ConflictResolutioner[store.ModelState], + cr *ConflictResolutioner[db.ModelState], message *pb.ModelUpdateStatusMessage, ) bool { timestamp := message.Update.Timestamp diff --git a/scheduler/pkg/scheduler/cleaner/test_versions_hack.go b/scheduler/pkg/scheduler/cleaner/test_versions_hack.go index c38a72f879..652f308ee0 100644 --- a/scheduler/pkg/scheduler/cleaner/test_versions_hack.go +++ b/scheduler/pkg/scheduler/cleaner/test_versions_hack.go @@ -19,7 +19,7 @@ type TestVersionCleaner struct { *VersionCleaner } -func NewTestVersionCleaner(schedStore store.ModelStore, logger log.FieldLogger) *TestVersionCleaner { +func NewTestVersionCleaner(schedStore store.ModelServerAPI, logger log.FieldLogger) *TestVersionCleaner { return &TestVersionCleaner{ VersionCleaner: NewVersionCleaner(schedStore, logger), } diff --git a/scheduler/pkg/scheduler/cleaner/versions.go b/scheduler/pkg/scheduler/cleaner/versions.go index d314a8bbe4..2a2c97d29e 100644 --- a/scheduler/pkg/scheduler/cleaner/versions.go +++ b/scheduler/pkg/scheduler/cleaner/versions.go @@ -12,6 +12,7 @@ package cleaner import ( "fmt" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" @@ -22,12 +23,12 @@ type ModelVersionCleaner interface { } type VersionCleaner struct { - store store.ModelStore + store store.ModelServerAPI logger log.FieldLogger } func NewVersionCleaner( - schedStore store.ModelStore, + schedStore store.ModelServerAPI, logger log.FieldLogger, ) *VersionCleaner { return &VersionCleaner{ @@ -59,13 +60,13 @@ func (v *VersionCleaner) cleanupOldVersions(modelName string) error { return err } if model == nil { - return fmt.Errorf("Can't find model with key %s", modelName) + return fmt.Errorf("can't find model with key %s", modelName) } - latest := model.GetLatest() + latest := model.Latest() if latest == nil { - return fmt.Errorf("Failed to find latest model for %s", modelName) + return fmt.Errorf("failed to find latest model for %s", modelName) } - if latest.ModelState().State == store.ModelAvailable { + if latest.State.State == db.ModelState_MODEL_STATE_AVAILABLE { for _, mv := range model.GetVersionsBeforeLastAvailable() { _, err := v.store.UnloadVersionModels(modelName, mv.GetVersion()) if err != nil { @@ -73,7 +74,7 @@ func (v *VersionCleaner) cleanupOldVersions(modelName string) error { } } } - if latest.ModelState().ModelGwState == store.ModelAvailable { + if latest.State.ModelGwState == db.ModelState_MODEL_STATE_AVAILABLE { for _, mv := range model.GetVersionsBeforeLastModelGwAvailable() { _, err := v.store.UnloadModelGwVersionModels(modelName, mv.GetVersion()) if err != nil { diff --git a/scheduler/pkg/scheduler/filters/deletedserver.go b/scheduler/pkg/scheduler/filters/deletedserver.go index 79c2374780..2273dd91ac 100644 --- a/scheduler/pkg/scheduler/filters/deletedserver.go +++ b/scheduler/pkg/scheduler/filters/deletedserver.go @@ -12,7 +12,7 @@ package filters import ( "fmt" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" ) type DeletedServerFilter struct{} @@ -21,10 +21,10 @@ func (e DeletedServerFilter) Name() string { return "DeletedServerFilter" } -func (e DeletedServerFilter) Filter(model *store.ModelVersion, server *store.ServerSnapshot) bool { +func (e DeletedServerFilter) Filter(model *db.ModelVersion, server *db.Server) bool { return server.ExpectedReplicas != 0 } -func (e DeletedServerFilter) Description(model *store.ModelVersion, server *store.ServerSnapshot) string { +func (e DeletedServerFilter) Description(model *db.ModelVersion, server *db.Server) string { return fmt.Sprintf("expected replicas %d != 0", server.ExpectedReplicas) } diff --git a/scheduler/pkg/scheduler/filters/explainer.go b/scheduler/pkg/scheduler/filters/explainer.go index 7c3eb23927..a5c2434cdd 100644 --- a/scheduler/pkg/scheduler/filters/explainer.go +++ b/scheduler/pkg/scheduler/filters/explainer.go @@ -12,7 +12,7 @@ package filters import ( "fmt" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" ) const ( @@ -25,8 +25,8 @@ func (s ExplainerFilter) Name() string { return "ExplainerFilter" } -func (s ExplainerFilter) Filter(model *store.ModelVersion, replica *store.ServerReplica) bool { - if model.GetModel().GetModelSpec().GetExplainer() != nil { +func (s ExplainerFilter) Filter(model *db.ModelVersion, replica *db.ServerReplica) bool { + if model.ModelDefn.ModelSpec.GetExplainer() != nil { for _, capability := range replica.GetCapabilities() { if alibiExplainerRequiredCapability == capability { return true @@ -37,6 +37,6 @@ func (s ExplainerFilter) Filter(model *store.ModelVersion, replica *store.Server return true } -func (s ExplainerFilter) Description(model *store.ModelVersion, replica *store.ServerReplica) string { - return fmt.Sprintf("model is explainer %v replica capabilities %v", model.GetModel().GetModelSpec().GetExplainer() == nil, replica.GetCapabilities()) +func (s ExplainerFilter) Description(model *db.ModelVersion, replica *db.ServerReplica) string { + return fmt.Sprintf("model is explainer %v replica capabilities %v", model.ModelDefn.GetModelSpec().GetExplainer() == nil, replica.GetCapabilities()) } diff --git a/scheduler/pkg/scheduler/filters/interface.go b/scheduler/pkg/scheduler/filters/interface.go index fc9f65fc31..4da372c2f8 100644 --- a/scheduler/pkg/scheduler/filters/interface.go +++ b/scheduler/pkg/scheduler/filters/interface.go @@ -9,16 +9,18 @@ the Change License after the Change Date as each is defined in accordance with t package filters -import "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" +import ( + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" +) type ReplicaFilter interface { Name() string - Filter(model *store.ModelVersion, replica *store.ServerReplica) bool - Description(model *store.ModelVersion, replica *store.ServerReplica) string + Filter(model *db.ModelVersion, replica *db.ServerReplica) bool + Description(model *db.ModelVersion, replica *db.ServerReplica) string } type ServerFilter interface { Name() string - Filter(model *store.ModelVersion, server *store.ServerSnapshot) bool - Description(model *store.ModelVersion, server *store.ServerSnapshot) string + Filter(model *db.ModelVersion, server *db.Server) bool + Description(model *db.ModelVersion, server *db.Server) string } diff --git a/scheduler/pkg/scheduler/filters/replicadraining.go b/scheduler/pkg/scheduler/filters/replicadraining.go index ccdec6d4e9..580919017b 100644 --- a/scheduler/pkg/scheduler/filters/replicadraining.go +++ b/scheduler/pkg/scheduler/filters/replicadraining.go @@ -12,7 +12,7 @@ package filters import ( "fmt" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" ) type ReplicaDrainingFilter struct{} @@ -21,11 +21,11 @@ func (r ReplicaDrainingFilter) Name() string { return "ReplicaDrainingFilter" } -func (r ReplicaDrainingFilter) Filter(model *store.ModelVersion, replica *store.ServerReplica) bool { +func (r ReplicaDrainingFilter) Filter(model *db.ModelVersion, replica *db.ServerReplica) bool { return !replica.GetIsDraining() } -func (r ReplicaDrainingFilter) Description(model *store.ModelVersion, replica *store.ServerReplica) string { +func (r ReplicaDrainingFilter) Description(model *db.ModelVersion, replica *db.ServerReplica) string { return fmt.Sprintf( "Replica server %d is draining check %t", replica.GetReplicaIdx(), replica.GetIsDraining()) diff --git a/scheduler/pkg/scheduler/filters/replicamemory.go b/scheduler/pkg/scheduler/filters/replicamemory.go index 854ebb7c17..d7e793dd17 100644 --- a/scheduler/pkg/scheduler/filters/replicamemory.go +++ b/scheduler/pkg/scheduler/filters/replicamemory.go @@ -13,7 +13,7 @@ import ( "fmt" "math" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" ) type AvailableMemoryReplicaFilter struct{} @@ -22,18 +22,18 @@ func (r AvailableMemoryReplicaFilter) Name() string { return "AvailableMemoryReplicaFilter" } -func isModelReplicaLoadedOnServerReplica(model *store.ModelVersion, replica *store.ServerReplica) bool { +func isModelReplicaLoadedOnServerReplica(model *db.ModelVersion, replica *db.ServerReplica) bool { if model.HasServer() { - return model.Server() == replica.GetServerName() && model.GetModelReplicaState(replica.GetReplicaIdx()).AlreadyLoadingOrLoaded() + return model.Server == replica.GetServerName() && model.GetModelReplicaState(int(replica.GetReplicaIdx())).AlreadyLoadingOrLoaded() } return false } -func (r AvailableMemoryReplicaFilter) Filter(model *store.ModelVersion, replica *store.ServerReplica) bool { +func (r AvailableMemoryReplicaFilter) Filter(model *db.ModelVersion, replica *db.ServerReplica) bool { mem := math.Max(0, float64(replica.GetAvailableMemory())-float64(replica.GetReservedMemory())) return model.GetRequiredMemory() <= uint64(mem) || isModelReplicaLoadedOnServerReplica(model, replica) } -func (r AvailableMemoryReplicaFilter) Description(model *store.ModelVersion, replica *store.ServerReplica) string { +func (r AvailableMemoryReplicaFilter) Description(model *db.ModelVersion, replica *db.ServerReplica) string { return fmt.Sprintf("model memory %d replica memory %d", model.GetRequiredMemory(), replica.GetAvailableMemory()) } diff --git a/scheduler/pkg/scheduler/filters/serverreplicas.go b/scheduler/pkg/scheduler/filters/serverreplicas.go index fb400dcb1a..61ae55fa01 100644 --- a/scheduler/pkg/scheduler/filters/serverreplicas.go +++ b/scheduler/pkg/scheduler/filters/serverreplicas.go @@ -12,7 +12,7 @@ package filters import ( "fmt" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" ) type ServerReplicaFilter struct{} @@ -21,10 +21,10 @@ func (r ServerReplicaFilter) Name() string { return "ServerReplicaFilter" } -func (r ServerReplicaFilter) Filter(model *store.ModelVersion, server *store.ServerSnapshot) bool { +func (r ServerReplicaFilter) Filter(model *db.ModelVersion, server *db.Server) bool { return len(server.Replicas) > 0 } -func (r ServerReplicaFilter) Description(model *store.ModelVersion, server *store.ServerSnapshot) string { +func (r ServerReplicaFilter) Description(model *db.ModelVersion, server *db.Server) string { return fmt.Sprintf("%d server replicas (waiting for server replicas to connect)", len(server.Replicas)) } diff --git a/scheduler/pkg/scheduler/filters/serverrequirements.go b/scheduler/pkg/scheduler/filters/serverrequirements.go index cac6e937b8..6fd721cc34 100644 --- a/scheduler/pkg/scheduler/filters/serverrequirements.go +++ b/scheduler/pkg/scheduler/filters/serverrequirements.go @@ -13,7 +13,7 @@ import ( "fmt" "strings" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" ) type ServerRequirementFilter struct{} @@ -22,7 +22,7 @@ func (s ServerRequirementFilter) Name() string { return "ServerRequirementsFilter" } -func (s ServerRequirementFilter) Filter(model *store.ModelVersion, server *store.ServerSnapshot) bool { +func (s ServerRequirementFilter) Filter(model *db.ModelVersion, server *db.Server) bool { if len(server.Replicas) == 0 { // Capabilities are currently stored on replicas, so no replicas means no capabilities can be determined. return false @@ -52,14 +52,14 @@ func contains(capabilities []string, requirement string) bool { return false } -func getFirstAvailableReplicaCapabilities(replicas map[int]*store.ServerReplica) []string { +func getFirstAvailableReplicaCapabilities(replicas map[int32]*db.ServerReplica) []string { for _, replica := range replicas { return replica.GetCapabilities() } return []string{} } -func (s ServerRequirementFilter) Description(model *store.ModelVersion, server *store.ServerSnapshot) string { +func (s ServerRequirementFilter) Description(model *db.ModelVersion, server *db.Server) string { requirements := model.GetRequirements() replicas := server.Replicas diff --git a/scheduler/pkg/scheduler/filters/sharing.go b/scheduler/pkg/scheduler/filters/sharing.go index c955e2e48a..ca9b80be85 100644 --- a/scheduler/pkg/scheduler/filters/sharing.go +++ b/scheduler/pkg/scheduler/filters/sharing.go @@ -12,7 +12,7 @@ package filters import ( "fmt" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" ) type SharingServerFilter struct{} @@ -21,12 +21,12 @@ func (e SharingServerFilter) Name() string { return "SharingServerFilter" } -func (e SharingServerFilter) Filter(model *store.ModelVersion, server *store.ServerSnapshot) bool { +func (e SharingServerFilter) Filter(model *db.ModelVersion, server *db.Server) bool { requestedServer := model.GetRequestedServer() return (requestedServer == nil && server.Shared) || (requestedServer != nil && *requestedServer == server.Name) } -func (e SharingServerFilter) Description(model *store.ModelVersion, server *store.ServerSnapshot) string { +func (e SharingServerFilter) Description(model *db.ModelVersion, server *db.Server) string { requestedServer := model.GetRequestedServer() if requestedServer != nil { return fmt.Sprintf("requested server %s == %s", *requestedServer, server.Name) diff --git a/scheduler/pkg/scheduler/scheduler.go b/scheduler/pkg/scheduler/scheduler.go index d9c849285d..63d6feb476 100644 --- a/scheduler/pkg/scheduler/scheduler.go +++ b/scheduler/pkg/scheduler/scheduler.go @@ -17,6 +17,7 @@ import ( "strings" "sync" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" @@ -30,7 +31,7 @@ const serverScaleupEventSource = "scheduler.server.scaleup" type SimpleScheduler struct { muSortAndUpdate sync.Mutex - store store.ModelStore + store store.ModelServerAPI logger log.FieldLogger synchroniser synchroniser.Synchroniser eventHub *coordinator.EventHub @@ -45,7 +46,7 @@ type SchedulerConfig struct { replicaSorts []sorters.ReplicaSorter } -func DefaultSchedulerConfig(store store.ModelStore) SchedulerConfig { +func DefaultSchedulerConfig(store store.ModelServerAPI) SchedulerConfig { return SchedulerConfig{ serverFilters: []filters.ServerFilter{filters.ServerReplicaFilter{}, filters.SharingServerFilter{}, filters.DeletedServerFilter{}, filters.ServerRequirementFilter{}}, replicaFilters: []filters.ReplicaFilter{filters.AvailableMemoryReplicaFilter{}, filters.ExplainerFilter{}, filters.ReplicaDrainingFilter{}}, @@ -55,7 +56,7 @@ func DefaultSchedulerConfig(store store.ModelStore) SchedulerConfig { } func NewSimpleScheduler(logger log.FieldLogger, - store store.ModelStore, + store store.ModelServerAPI, schedulerConfig SchedulerConfig, synchroniser synchroniser.Synchroniser, eventHub *coordinator.EventHub, @@ -124,12 +125,12 @@ func (s *SimpleScheduler) getFailedModels() ([]string, error) { var failedModels []string for _, model := range models { - version := model.GetLatest() + version := model.Latest() if version != nil { - versionState := version.ModelState() - if versionState.State == store.ModelFailed || versionState.State == store.ScheduleFailed || - ((versionState.State == store.ModelAvailable || versionState.State == store.ModelProgressing) && - versionState.AvailableReplicas < version.GetDeploymentSpec().GetReplicas()) { + versionState := version.State + if versionState.State == db.ModelState_MODEL_STATE_FAILED || versionState.State == db.ModelState_MODEL_STATE_SCHEDULE_FAILED || + ((versionState.State == db.ModelState_MODEL_STATE_AVAILABLE || versionState.State == db.ModelState_MODEL_STATE_PROGRESSING) && + versionState.AvailableReplicas < version.ModelDefn.DeploymentSpec.GetReplicas()) { failedModels = append(failedModels, model.Name) } } @@ -155,7 +156,7 @@ func (s *SimpleScheduler) scheduleToServer(modelName string) (*coordinator.Serve return nil, errors.New("Unable to find model") } - latestModel := model.GetLatest() + latestModel := model.Latest() if latestModel == nil { return nil, errors.New("Unable to find latest version for model") } @@ -166,7 +167,7 @@ func (s *SimpleScheduler) scheduleToServer(modelName string) (*coordinator.Serve // - otherwise proceed a normal server := "" if latestModel.HasServer() { - server = latestModel.Server() + server = latestModel.Server } logger.Debug("Ensuring deleted model is removed") @@ -183,7 +184,7 @@ func (s *SimpleScheduler) scheduleToServer(modelName string) (*coordinator.Serve } } - err = s.store.UpdateLoadedModels(modelName, latestModel.GetVersion(), server, []*store.ServerReplica{}) + err = s.store.UpdateLoadedModels(modelName, latestModel.GetVersion(), server, []*db.ServerReplica{}) if err != nil { logger.WithError(err).WithField("server", server).Warn("Failed to unschedule model replicas from server") } @@ -192,7 +193,7 @@ func (s *SimpleScheduler) scheduleToServer(modelName string) (*coordinator.Serve } // Model needs to be (re)scheduled - var filteredServers []*store.ServerSnapshot + var filteredServers []*db.Server // Get all servers servers, err := s.store.GetServers(false, true) @@ -205,14 +206,14 @@ func (s *SimpleScheduler) scheduleToServer(modelName string) (*coordinator.Serve if len(filteredServers) == 0 { msg := "Failed to schedule model as no matching servers are available" logger.Warn(msg) - if err := s.store.FailedScheduling(latestModel.Key(), latestModel.GetVersion(), msg, !latestModel.HasLiveReplicas()); err != nil { + if err := s.store.FailedScheduling(latestModel.ModelName(), latestModel.GetVersion(), msg, !latestModel.HasLiveReplicas()); err != nil { return nil, fmt.Errorf("%s: got error making model as failed in memory store: %w", msg, err) } return nil, errors.New(msg) } desiredReplicas := latestModel.DesiredReplicas() - minReplicas := latestModel.GetDeploymentSpec().GetMinReplicas() + minReplicas := latestModel.ModelDefn.DeploymentSpec.GetMinReplicas() s.sortServers(latestModel, filteredServers) logger. @@ -251,7 +252,7 @@ func (s *SimpleScheduler) scheduleToServer(modelName string) (*coordinator.Serve // for example in the case that a model is just marked as loading on a particular server replica // then it gets a delete request (before it is marked as loaded or available) we need to make sure // that we can unload it from the server - if err := s.store.FailedScheduling(latestModel.Key(), + if err := s.store.FailedScheduling(latestModel.ModelName(), latestModel.GetVersion(), msg, !latestModel.HasLiveReplicas() && !latestModel.IsLoadingOrLoadedOnServer()); err != nil { return nil, fmt.Errorf("%s: got error making model as failed in memory store: %w", msg, err) @@ -264,8 +265,8 @@ func (s *SimpleScheduler) scheduleToServer(modelName string) (*coordinator.Serve return serverEvent, nil } -func (s *SimpleScheduler) findAndUpdateToServers(filteredServers []*store.ServerSnapshot, latestModel *store.ModelVersion, desiredReplicas, desiredMinReplicas int) bool { - modelName := latestModel.GetMeta().GetName() +func (s *SimpleScheduler) findAndUpdateToServers(filteredServers []*db.Server, latestModel *db.ModelVersion, desiredReplicas, desiredMinReplicas int) bool { + modelName := latestModel.ModelName() logger := s.logger.WithField("func", "findAndUpdateToServers").WithField("model", modelName) ok := false @@ -314,7 +315,7 @@ func (s *SimpleScheduler) findAndUpdateToServers(filteredServers []*store.Server return ok } -func showServerSlice(servers []*store.ServerSnapshot) string { +func showServerSlice(servers []*db.Server) string { var sb strings.Builder for idx, server := range servers { if idx > 0 { @@ -325,27 +326,27 @@ func showServerSlice(servers []*store.ServerSnapshot) string { return sb.String() } -func (s *SimpleScheduler) sortServers(model *store.ModelVersion, server []*store.ServerSnapshot) { +func (s *SimpleScheduler) sortServers(model *db.ModelVersion, server []*db.Server) { logger := s.logger.WithField("func", "sortServers") for _, sorter := range s.serverSorts { - logger.Debugf("About to sort servers for %s:%d with %s: %s", model.Key(), model.GetVersion(), sorter.Name(), showServerSlice(server)) + logger.Debugf("About to sort servers for %s:%d with %s: %s", model.ModelName(), model.GetVersion(), sorter.Name(), showServerSlice(server)) sort.SliceStable(server, func(i, j int) bool { return sorter.IsLess(&sorters.CandidateServer{Model: model, Server: server[i]}, &sorters.CandidateServer{Model: model, Server: server[j]}) }) - logger.Debugf("Sorted servers for %s:%d with %s: %s", model.Key(), model.GetVersion(), sorter.Name(), showServerSlice(server)) + logger.Debugf("Sorted servers for %s:%d with %s: %s", model.ModelName(), model.GetVersion(), sorter.Name(), showServerSlice(server)) } } -func (s *SimpleScheduler) serverScaleUp(modelVersion *store.ModelVersion) *coordinator.ServerEventMsg { +func (s *SimpleScheduler) serverScaleUp(modelVersion *db.ModelVersion) *coordinator.ServerEventMsg { logger := s.logger.WithField("func", "serverScaleUp") - if modelVersion.Server() == "" { - logger.Warnf("Empty server for %s so ignoring scale up request", modelVersion.GetMeta().Name) + if modelVersion.Server == "" { + logger.Warnf("Empty server for %s so ignoring scale up request", modelVersion.ModelName()) return nil } return &coordinator.ServerEventMsg{ - ServerName: modelVersion.Server(), + ServerName: modelVersion.Server, UpdateContext: coordinator.SERVER_SCALE_UP, } } @@ -356,7 +357,7 @@ func showReplicaSlice(candidateServer *sorters.CandidateServer) string { if idx > 0 { sb.WriteString(",") } - sb.WriteString(strconv.Itoa(replica.GetReplicaIdx())) + sb.WriteString(strconv.Itoa(int(replica.GetReplicaIdx()))) sb.WriteString(":") sb.WriteString(replica.GetInferenceSvc()) } @@ -366,21 +367,21 @@ func showReplicaSlice(candidateServer *sorters.CandidateServer) string { func (s *SimpleScheduler) sortReplicas(candidateServer *sorters.CandidateServer) { logger := s.logger.WithField("func", "sortReplicas") for _, sorter := range s.replicaSorts { - logger.Debugf("About to sort replicas for %s:%d with %s: %s", candidateServer.Model.Key(), candidateServer.Model.GetVersion(), sorter.Name(), showReplicaSlice(candidateServer)) + logger.Debugf("About to sort replicas for %s:%d with %s: %s", candidateServer.Model.ModelName(), candidateServer.Model.GetVersion(), sorter.Name(), showReplicaSlice(candidateServer)) sort.SliceStable(candidateServer.ChosenReplicas, func(i, j int) bool { return sorter.IsLess(&sorters.CandidateReplica{Model: candidateServer.Model, Server: candidateServer.Server, Replica: candidateServer.ChosenReplicas[i]}, &sorters.CandidateReplica{Model: candidateServer.Model, Server: candidateServer.Server, Replica: candidateServer.ChosenReplicas[j]}) }) - logger.Debugf("Sorted replicas for %s:%d with %s: %s", candidateServer.Model.Key(), candidateServer.Model.GetVersion(), sorter.Name(), showReplicaSlice(candidateServer)) + logger.Debugf("Sorted replicas for %s:%d with %s: %s", candidateServer.Model.ModelName(), candidateServer.Model.GetVersion(), sorter.Name(), showReplicaSlice(candidateServer)) } } // Filter servers for this model -func (s *SimpleScheduler) filterServers(model *store.ModelVersion, servers []*store.ServerSnapshot) []*store.ServerSnapshot { - logger := s.logger.WithField("func", "filterServer").WithField("model", model.GetMeta().GetName()) +func (s *SimpleScheduler) filterServers(model *db.ModelVersion, servers []*db.Server) []*db.Server { + logger := s.logger.WithField("func", "filterServer").WithField("model", model.ModelDefn.Meta.Name) logger.WithField("num_servers", len(servers)).Debug("Filtering servers for model") - var filteredServers []*store.ServerSnapshot + var filteredServers []*db.Server for _, server := range servers { ok := true for _, serverFilter := range s.serverFilters { @@ -405,10 +406,10 @@ func (s *SimpleScheduler) filterServers(model *store.ModelVersion, servers []*st return filteredServers } -func (s *SimpleScheduler) filterReplicas(model *store.ModelVersion, server *store.ServerSnapshot) *sorters.CandidateServer { +func (s *SimpleScheduler) filterReplicas(model *db.ModelVersion, server *db.Server) *sorters.CandidateServer { logger := s.logger. WithField("func", "filterReplicas"). - WithField("model", model.GetMeta().GetName()). + WithField("model", model.ModelDefn.Meta.Name). WithField("server", server.Name) logger.Debug("Filtering server replicas for model") diff --git a/scheduler/pkg/scheduler/scheduler_test.go b/scheduler/pkg/scheduler/scheduler_test.go index 56fc4e6647..4f789dcd92 100644 --- a/scheduler/pkg/scheduler/scheduler_test.go +++ b/scheduler/pkg/scheduler/scheduler_test.go @@ -43,7 +43,7 @@ type mockStore struct { unloadedModels map[string]uint32 } -var _ store.ModelStore = (*mockStore)(nil) +var _ store.ModelServerAPI = (*mockStore)(nil) func (f mockStore) FailedScheduling(modelID string, version uint32, reason string, reset bool) error { return nil diff --git a/scheduler/pkg/scheduler/sorters/interface.go b/scheduler/pkg/scheduler/sorters/interface.go index 04fd99d40b..c65e9f45d8 100644 --- a/scheduler/pkg/scheduler/sorters/interface.go +++ b/scheduler/pkg/scheduler/sorters/interface.go @@ -9,18 +9,20 @@ the Change License after the Change Date as each is defined in accordance with t package sorters -import "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" +import ( + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" +) type CandidateServer struct { - Model *store.ModelVersion - Server *store.ServerSnapshot - ChosenReplicas []*store.ServerReplica + Model *db.ModelVersion + Server *db.Server + ChosenReplicas []*db.ServerReplica } type CandidateReplica struct { - Model *store.ModelVersion - Server *store.ServerSnapshot - Replica *store.ServerReplica + Model *db.ModelVersion + Server *db.Server + Replica *db.ServerReplica } type ServerSorter interface { diff --git a/scheduler/pkg/scheduler/sorters/loaded.go b/scheduler/pkg/scheduler/sorters/loaded.go index 0991ca522c..b3a7e282ac 100644 --- a/scheduler/pkg/scheduler/sorters/loaded.go +++ b/scheduler/pkg/scheduler/sorters/loaded.go @@ -16,8 +16,8 @@ func (m ModelAlreadyLoadedSorter) Name() string { } func (m ModelAlreadyLoadedSorter) IsLess(i *CandidateReplica, j *CandidateReplica) bool { - iIsLoading := i.Model.IsLoadingOrLoaded(i.Server.Name, i.Replica.GetReplicaIdx()) - jIsLoading := j.Model.IsLoadingOrLoaded(j.Server.Name, j.Replica.GetReplicaIdx()) + iIsLoading := i.Model.IsLoadingOrLoaded(i.Server.Name, int(i.Replica.GetReplicaIdx())) + jIsLoading := j.Model.IsLoadingOrLoaded(j.Server.Name, int(j.Replica.GetReplicaIdx())) return iIsLoading && !jIsLoading } @@ -30,5 +30,5 @@ func (m ModelAlreadyLoadedOnServerSorter) Name() string { } func (m ModelAlreadyLoadedOnServerSorter) IsLess(i *CandidateServer, j *CandidateServer) bool { - return i.Model.Server() == i.Server.Name + return i.Model.Server == i.Server.Name } diff --git a/scheduler/pkg/server/server.go b/scheduler/pkg/server/server.go index 66aface4f8..3c81f7b3e7 100644 --- a/scheduler/pkg/server/server.go +++ b/scheduler/pkg/server/server.go @@ -19,16 +19,15 @@ import ( "sync" "time" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/health" + pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + seldontls "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" log "github.com/sirupsen/logrus" "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" - "google.golang.org/protobuf/types/known/timestamppb" - - "github.com/seldonio/seldon-core/apis/go/v2/mlops/health" - pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" - seldontls "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" cr "github.com/seldonio/seldon-core/scheduler/v2/pkg/kafka/conflict-resolution" @@ -64,7 +63,7 @@ type SchedulerServer struct { pb.UnimplementedSchedulerServer health.UnimplementedHealthCheckServiceServer logger log.FieldLogger - modelStore store.ModelStore + modelStore store.ModelServerAPI experimentServer experiment.ExperimentServer pipelineHandler pipeline.PipelineHandler scheduler scheduler2.Scheduler @@ -104,7 +103,7 @@ type SchedulerServerConfig struct { type ModelEventStream struct { mu sync.Mutex streams map[pb.Scheduler_SubscribeModelStatusServer]*ModelSubscription - conflictResolutioner *cr.ConflictResolutioner[store.ModelState] + conflictResolutioner *cr.ConflictResolutioner[db.ModelState] } type ServerEventStream struct { @@ -270,7 +269,7 @@ func getEnVar(logger *log.Entry, key string, defaultValue int) int { func NewSchedulerServer( logger log.FieldLogger, - modelStore store.ModelStore, + modelStore store.ModelServerAPI, experiementServer experiment.ExperimentServer, pipelineHandler pipeline.PipelineHandler, scheduler scheduler2.Scheduler, @@ -302,7 +301,7 @@ func NewSchedulerServer( scheduler: scheduler, modelEventStream: ModelEventStream{ streams: make(map[pb.Scheduler_SubscribeModelStatusServer]*ModelSubscription), - conflictResolutioner: cr.NewConflictResolution[store.ModelState](logger), + conflictResolutioner: cr.NewConflictResolution[db.ModelState](logger), }, serverEventStream: ServerEventStream{ streams: make(map[pb.Scheduler_SubscribeServerStatusServer]*ServerSubscription), @@ -470,7 +469,7 @@ func (s *SchedulerServer) HealthCheck(_ context.Context, _ *health.HealthCheckRe func (s *SchedulerServer) rescheduleModels(serverKey string) { logger := s.logger.WithField("func", "rescheduleModels") - server, err := s.modelStore.GetServer(serverKey, false, true) + server, _, err := s.modelStore.GetServer(serverKey, false, true) if err != nil { logger.WithError(err).Errorf("Failed to get server %s", serverKey) return @@ -521,41 +520,41 @@ func (s *SchedulerServer) UnloadModel(ctx context.Context, req *pb.UnloadModelRe return &pb.UnloadModelResponse{}, nil } -func createModelVersionStatus(mv *store.ModelVersion) *pb.ModelVersionStatus { +func createModelVersionStatus(mv *db.ModelVersion) *pb.ModelVersionStatus { stateMap := make(map[int32]*pb.ModelReplicaStatus) for k, v := range mv.ReplicaState() { stateMap[int32(k)] = &pb.ModelReplicaStatus{ State: pb.ModelReplicaStatus_ModelReplicaState(pb.ModelReplicaStatus_ModelReplicaState_value[v.State.String()]), Reason: v.Reason, - LastChangeTimestamp: timestamppb.New(v.Timestamp), + LastChangeTimestamp: v.Timestamp, } } - modelState := mv.ModelState() + modelState := mv.State mvs := &pb.ModelVersionStatus{ Version: mv.GetVersion(), - ServerName: mv.Server(), + ServerName: mv.Server, ModelReplicaState: stateMap, State: &pb.ModelStatus{ State: pb.ModelStatus_ModelState(pb.ModelStatus_ModelState_value[modelState.State.String()]), ModelGwState: pb.ModelStatus_ModelState(pb.ModelStatus_ModelState_value[modelState.ModelGwState.String()]), Reason: modelState.Reason, ModelGwReason: modelState.ModelGwReason, - LastChangeTimestamp: timestamppb.New(modelState.Timestamp), + LastChangeTimestamp: modelState.Timestamp, AvailableReplicas: modelState.AvailableReplicas, UnavailableReplicas: modelState.UnavailableReplicas, }, - ModelDefn: mv.GetModel(), + ModelDefn: mv.ModelDefn, } - if mv.GetMeta().KubernetesMeta != nil { - mvs.KubernetesMeta = mv.GetModel().GetMeta().GetKubernetesMeta() + if mv.ModelDefn.Meta.KubernetesMeta != nil { + mvs.KubernetesMeta = mv.ModelDefn.Meta.KubernetesMeta } return mvs } -func (s *SchedulerServer) modelStatusImpl(model *store.ModelSnapshot, allVersions bool) (*pb.ModelStatusResponse, error) { +func (s *SchedulerServer) modelStatusImpl(model *db.Model, allVersions bool) (*pb.ModelStatusResponse, error) { var modelVersionStatuses []*pb.ModelVersionStatus if !allVersions { - latestModel := model.GetLatest() + latestModel := model.Latest() if latestModel == nil { return nil, status.Errorf(codes.FailedPrecondition, "Failed to find model %s", model.Name) } @@ -669,7 +668,7 @@ func (s *SchedulerServer) ServerStatus( return nil } else { // Single server requested - server, err := s.modelStore.GetServer(req.GetName(), true, true) + server, _, err := s.modelStore.GetServer(req.GetName(), true, true) if err != nil { return status.Errorf(codes.FailedPrecondition, "%s", err.Error()) } @@ -689,7 +688,7 @@ func (s *SchedulerServer) ServerStatus( } } -func createServerStatusUpdateResponse(s *store.ServerSnapshot) *pb.ServerStatusResponse { +func createServerStatusUpdateResponse(s *db.Server) *pb.ServerStatusResponse { // note we dont count draining replicas in available replicas resp := &pb.ServerStatusResponse{ @@ -724,7 +723,7 @@ func createServerStatusUpdateResponse(s *store.ServerSnapshot) *pb.ServerStatusR return resp } -func createServerScaleResponse(s *store.ServerSnapshot, expectedReplicas uint32) *pb.ServerStatusResponse { +func createServerScaleResponse(s *db.Server, expectedReplicas uint32) *pb.ServerStatusResponse { // we dont care about populating the other fields as they should not be used by the controller, reconsider if this changes resp := &pb.ServerStatusResponse{ diff --git a/scheduler/pkg/server/server_status.go b/scheduler/pkg/server/server_status.go index a335cfed5a..89c1276f5c 100644 --- a/scheduler/pkg/server/server_status.go +++ b/scheduler/pkg/server/server_status.go @@ -14,13 +14,13 @@ import ( "fmt" "time" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" cr "github.com/seldonio/seldon-core/scheduler/v2/pkg/kafka/conflict-resolution" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) @@ -31,16 +31,16 @@ const ( // pollerRetryFailedCreateModels will retry creating models on model-gw which failed to load. Most likely // due to connectivity issues with kafka. func (s *SchedulerServer) pollerRetryFailedCreateModels(ctx context.Context, tick time.Duration, maxRetry uint) { - s.pollerRetryFailedModels(ctx, tick, "pollerRetryFailedCreateModels", store.ModelFailed, "create", maxRetry) + s.pollerRetryFailedModels(ctx, tick, "pollerRetryFailedCreateModels", db.ModelState_MODEL_STATE_FAILED, "create", maxRetry) } // pollerRetryFailedDeleteModels will retry deleting models on model-gw which failed to terminate. Most likely // due to connectivity issues with kafka. func (s *SchedulerServer) pollerRetryFailedDeleteModels(ctx context.Context, tick time.Duration, maxRetry uint) { - s.pollerRetryFailedModels(ctx, tick, "pollerRetryFailedDeleteModels", store.ModelTerminateFailed, "delete", maxRetry) + s.pollerRetryFailedModels(ctx, tick, "pollerRetryFailedDeleteModels", db.ModelState_MODEL_STATE_TERMINATE_FAILED, "delete", maxRetry) } -func (s *SchedulerServer) pollerRetryFailedModels(ctx context.Context, tick time.Duration, funcName string, targetState store.ModelState, operation string, maxRetry uint) { +func (s *SchedulerServer) pollerRetryFailedModels(ctx context.Context, tick time.Duration, funcName string, targetState db.ModelState, operation string, maxRetry uint) { logger := s.logger.WithField("func", funcName) ticker := time.NewTicker(tick) defer ticker.Stop() @@ -50,7 +50,11 @@ func (s *SchedulerServer) pollerRetryFailedModels(ctx context.Context, tick time case <-ctx.Done(): return case <-ticker.C: - models := s.getModelsInGwRetryState(logger, targetState, operation, maxRetry) + models, err := s.getModelsInGwRetryState(logger, targetState, operation, maxRetry) + if err != nil { + logger.WithError(err).Errorf("Failed getting models") + continue + } if len(models) > 0 { s.modelGwRebalanceForModels(models) } @@ -62,11 +66,14 @@ func (s *SchedulerServer) mkModelRetryKey(modelName string, version uint32) stri return fmt.Sprintf("%s_%d", modelName, version) } -func (s *SchedulerServer) getModelsInGwRetryState(logger *logrus.Entry, targetState store.ModelState, operation string, maxRetry uint) []*store.ModelSnapshot { - modelNames := s.modelStore.GetAllModels() +func (s *SchedulerServer) getModelsInGwRetryState(logger *logrus.Entry, targetState db.ModelState, operation string, maxRetry uint) ([]*db.Model, error) { + modelNames, err := s.modelStore.GetAllModels() + if err != nil { + return nil, err + } logger.WithField("models", modelNames).Debugf("Poller retry to %s failed models on model-gw", operation) - models := make([]*store.ModelSnapshot, 0) + models := make([]*db.Model, 0) for _, modelName := range modelNames { model, err := s.modelStore.GetModel(modelName) @@ -75,18 +82,18 @@ func (s *SchedulerServer) getModelsInGwRetryState(logger *logrus.Entry, targetSt continue } - if model.GetLatest() == nil { + if model.Latest() == nil { logger.Warnf("Model %s has no versions, skipping", modelName) continue } - modelGwState := model.GetLatest().ModelState().ModelGwState + modelGwState := model.Latest().State.ModelGwState if modelGwState != targetState { logger.Debugf("Model-gw model %s state %s != %s, skipping", modelName, modelGwState, targetState) continue } - key := s.mkModelRetryKey(model.Name, model.GetLatest().GetVersion()) + key := s.mkModelRetryKey(model.Name, model.Latest().GetVersion()) s.muRetriedFailedModels.Lock() s.retriedFailedModels[key]++ if s.retriedFailedModels[key] > maxRetry { @@ -100,7 +107,7 @@ func (s *SchedulerServer) getModelsInGwRetryState(logger *logrus.Entry, targetSt models = append(models, model) } - return models + return models, nil } func (s *SchedulerServer) resetModelRetryCount(msg *pb.ModelUpdateMessage) { @@ -121,21 +128,21 @@ func (s *SchedulerServer) ModelStatusEvent(_ context.Context, message *pb.ModelU logger := s.logger.WithField("func", "ModelStatusEvent") - var statusVal store.ModelState + var statusVal db.ModelState switch message.Update.Op { case pb.ModelUpdateMessage_Create: if message.Success { s.resetModelRetryCount(message.Update) - statusVal = store.ModelAvailable + statusVal = db.ModelState_MODEL_STATE_AVAILABLE } else { - statusVal = store.ModelFailed + statusVal = db.ModelState_MODEL_STATE_FAILED } case pb.ModelUpdateMessage_Delete: if message.Success { s.removeModelRetryCount(message.Update) - statusVal = store.ModelTerminated + statusVal = db.ModelState_MODEL_STATE_TERMINATED } else { - statusVal = store.ModelTerminateFailed + statusVal = db.ModelState_MODEL_STATE_TERMINATE_FAILED } } @@ -155,7 +162,7 @@ func (s *SchedulerServer) ModelStatusEvent(_ context.Context, message *pb.ModelU confRes.UpdateStatus(modelName, stream, statusVal) modelStatusVal, reason := cr.GetModelStatus(confRes, modelName, message) - if modelStatusVal == store.ModelTerminated { + if modelStatusVal == db.ModelState_MODEL_STATE_TERMINATED { confRes.Delete(modelName) } @@ -233,7 +240,10 @@ func (s *SchedulerServer) sendCurrentModelStatuses(stream pb.Scheduler_Subscribe s.modelEventStream.mu.Lock() defer s.modelEventStream.mu.Unlock() - modelNames := s.modelStore.GetAllModels() + modelNames, err := s.modelStore.GetAllModels() + if err != nil { + return err + } for _, modelName := range modelNames { model, err := s.modelStore.GetModel(modelName) if err != nil { @@ -267,17 +277,20 @@ func contains(slice []string, val string) bool { return false } -func (s *SchedulerServer) allPermittedModels() []*store.ModelSnapshot { - var permittedModels []*store.ModelSnapshot - modelNames := s.modelStore.GetAllModels() +func (s *SchedulerServer) allPermittedModels() ([]*db.Model, error) { + var permittedModels []*db.Model + modelNames, err := s.modelStore.GetAllModels() + if err != nil { + return nil, fmt.Errorf("failed to get all models: %w", err) + } - allowedModelGwStates := map[store.ModelState]struct{}{ - store.ModelCreate: {}, - store.ModelProgressing: {}, - store.ModelAvailable: {}, - store.ModelTerminating: {}, + allowedModelGwStates := map[db.ModelState]struct{}{ + db.ModelState_MODEL_STATE_CREATE: {}, + db.ModelState_MODEL_STATE_PROGRESSING: {}, + db.ModelState_MODEL_STATE_AVAILABLE: {}, + db.ModelState_MODEL_STATE_TERMINATING: {}, // we want to retry models which failed to create on model-gw i.e. likely kafka connectivity issues - store.ModelFailed: {}, + db.ModelState_MODEL_STATE_FAILED: {}, } for _, modelName := range modelNames { @@ -286,20 +299,20 @@ func (s *SchedulerServer) allPermittedModels() []*store.ModelSnapshot { s.logger.WithError(err).Errorf("Failed to get model %s for running models", modelName) continue } - if model.GetLatest() == nil { + if model.Latest() == nil { s.logger.Warnf("Model %s has no versions, skipping running models", modelName) continue } - if _, ok := allowedModelGwStates[model.GetLatest().ModelState().ModelGwState]; ok { + if _, ok := allowedModelGwStates[model.Latest().State.ModelGwState]; ok { permittedModels = append(permittedModels, model) } } - return permittedModels + return permittedModels, nil } -func (s *SchedulerServer) createModelDeletionMessage(model *store.ModelSnapshot, keepTopics bool) (*pb.ModelStatusResponse, error) { +func (s *SchedulerServer) createModelDeletionMessage(model *db.Model, keepTopics bool) (*pb.ModelStatusResponse, error) { ms, err := s.modelStatusImpl(model, false) if err != nil { return nil, err @@ -309,7 +322,7 @@ func (s *SchedulerServer) createModelDeletionMessage(model *store.ModelSnapshot, return ms, nil } -func (s *SchedulerServer) createModelCreationMessage(model *store.ModelSnapshot) (*pb.ModelStatusResponse, error) { +func (s *SchedulerServer) createModelCreationMessage(model *db.Model) (*pb.ModelStatusResponse, error) { ms, err := s.modelStatusImpl(model, false) if err != nil { return nil, err @@ -319,12 +332,15 @@ func (s *SchedulerServer) createModelCreationMessage(model *store.ModelSnapshot) } func (s *SchedulerServer) modelGwRebalance() { - runningModels := s.allPermittedModels() + runningModels, err := s.allPermittedModels() + if err != nil { + s.logger.WithError(err).Error("Failed to run gw rebalance") + } s.logger.Debugf("Rebalancing model gateways for running models: %v", runningModels) s.modelGwRebalanceForModels(runningModels) } -func (s *SchedulerServer) modelGwRebalanceForModels(models []*store.ModelSnapshot) { +func (s *SchedulerServer) modelGwRebalanceForModels(models []*db.Model) { s.modelEventStream.mu.Lock() defer s.modelEventStream.mu.Unlock() @@ -346,11 +362,11 @@ func (s *SchedulerServer) modelGwRebalanceForModels(models []*store.ModelSnapsho } } -func (s *SchedulerServer) modelGwRebalanceNoStream(model *store.ModelSnapshot) { - modelState := store.ModelCreate - if model.GetLatest().ModelState().ModelGwState == store.ModelTerminating || - model.GetLatest().ModelState().ModelGwState == store.ModelTerminateFailed { - modelState = store.ModelTerminated +func (s *SchedulerServer) modelGwRebalanceNoStream(model *db.Model) { + modelState := db.ModelState_MODEL_STATE_CREATE + if model.Latest().State.ModelGwState == db.ModelState_MODEL_STATE_TERMINATING || + model.Latest().State.ModelGwState == db.ModelState_MODEL_STATE_TERMINATE_FAILED { + modelState = db.ModelState_MODEL_STATE_TERMINATED } s.logger.Debugf( @@ -360,7 +376,7 @@ func (s *SchedulerServer) modelGwRebalanceNoStream(model *store.ModelSnapshot) { if err := s.modelStore.SetModelGwModelState( model.Name, - model.GetLatest().GetVersion(), + model.Latest().GetVersion(), modelState, "No model gateway available to handle model", modelStatusEventSource, @@ -369,7 +385,7 @@ func (s *SchedulerServer) modelGwRebalanceNoStream(model *store.ModelSnapshot) { } } -func (s *SchedulerServer) modelGwReblanceStreams(model *store.ModelSnapshot) { +func (s *SchedulerServer) modelGwReblanceStreams(model *db.Model) { consumerBucketId := s.getModelGatewayBucketId(model.Name) s.logger.Debugf("Rebalancing model %s with consumber bucket id %s", model.Name, consumerBucketId) @@ -392,11 +408,11 @@ func (s *SchedulerServer) modelGwReblanceStreams(model *store.ModelSnapshot) { if contains(servers, server) { s.logger.Debug("Server contains model, sending status update for: ", server) - state := model.GetLatest().ModelState().ModelGwState + state := model.Latest().State.ModelGwState var msg *pb.ModelStatusResponse var err error - if state == store.ModelTerminating || state == store.ModelTerminateFailed { + if state == db.ModelState_MODEL_STATE_TERMINATING || state == db.ModelState_MODEL_STATE_TERMINATE_FAILED { s.logger.Debugf("Model %s in state %s, sending deletion message", model.Name, state) msg, err = s.createModelDeletionMessage(model, false) } else { @@ -406,8 +422,8 @@ func (s *SchedulerServer) modelGwReblanceStreams(model *store.ModelSnapshot) { // set modelgw state to progressing and display rebalance reason if err := s.modelStore.SetModelGwModelState( model.Name, - model.GetLatest().GetVersion(), - store.ModelProgressing, + model.Latest().GetVersion(), + db.ModelState_MODEL_STATE_PROGRESSING, "Rebalance", modelStatusEventSource, ); err != nil { @@ -507,13 +523,13 @@ func (s *SchedulerServer) sendModelStatusEvent(evt coordinator.ModelEventMsg) er return err } - if model.GetLatest() == nil { + if model.Latest() == nil { logger.Warnf("Failed to find latest model version for %s so ignoring event", evt.String()) return nil } - if model.GetLatest().GetVersion() != evt.ModelVersion { - logger.Warnf("Latest model version %d does not match event version %d for %s so ignoring event", model.GetLatest().GetVersion(), evt.ModelVersion, evt.String()) + if model.Latest().GetVersion() != evt.ModelVersion { + logger.Warnf("Latest model version %d does not match event version %d for %s so ignoring event", model.Latest().GetVersion(), evt.ModelVersion, evt.String()) return nil } @@ -547,20 +563,20 @@ func (s *SchedulerServer) sendModelStatusEvent(evt coordinator.ModelEventMsg) er return nil } - modelState := model.GetLatest().ModelState() - if len(modelGwStreams) == 0 && modelState.ModelGwState != store.ModelTerminated { + modelState := model.Latest().State + if len(modelGwStreams) == 0 && modelState.ModelGwState != db.ModelState_MODEL_STATE_TERMINATED { // handle case where we don't have any model-gateway streams errMsg := "No model gateway available to handle model" logger.WithField("model", model.Name).Warn(errMsg) modelGwState := modelState.ModelGwState - if modelState.ModelGwState == store.ModelTerminate || modelState.ModelGwState == store.ModelTerminating { - modelGwState = store.ModelTerminated + if modelState.ModelGwState == db.ModelState_MODEL_STATE_TERMINATE || modelState.ModelGwState == db.ModelState_MODEL_STATE_TERMINATING { + modelGwState = db.ModelState_MODEL_STATE_TERMINATED } if err := s.modelStore.SetModelGwModelState( model.Name, - model.GetLatest().GetVersion(), + model.Latest().GetVersion(), modelGwState, errMsg, modelStatusEventSource, @@ -575,12 +591,12 @@ func (s *SchedulerServer) sendModelStatusEvent(evt coordinator.ModelEventMsg) er } switch modelState.ModelGwState { - case store.ModelCreate: + case db.ModelState_MODEL_STATE_CREATE: logger.Debugf("Model %s is in create state, sending creation message", model.Name) if err := s.modelStore.SetModelGwModelState( model.Name, - model.GetLatest().GetVersion(), - store.ModelProgressing, + model.Latest().GetVersion(), + db.ModelState_MODEL_STATE_PROGRESSING, "Model is being loaded onto model gateway", modelStatusEventSource, ); err != nil { @@ -598,12 +614,12 @@ func (s *SchedulerServer) sendModelStatusEvent(evt coordinator.ModelEventMsg) er // send message to model gateway streams s.sendModelStatusEventToStreamsWithTimestamp(evt, ms, modelGwStreams) - case store.ModelTerminate: + case db.ModelState_MODEL_STATE_TERMINATE: logger.Debugf("Model %s is in terminate state, sending deletion message", model.Name) if err := s.modelStore.SetModelGwModelState( model.Name, - model.GetLatest().GetVersion(), - store.ModelTerminating, + model.Latest().GetVersion(), + db.ModelState_MODEL_STATE_TERMINATING, "Model is being unloaded from model gateway", modelStatusEventSource, ); err != nil { @@ -694,7 +710,7 @@ func (s *SchedulerServer) handleServerEvents(event coordinator.ServerEventMsg) { logger := s.logger.WithField("func", "handleServerEvents") logger.Debugf("Got server state %s change for %s", event.ServerName, event.String()) - server, err := s.modelStore.GetServer(event.ServerName, true, true) + server, stats, err := s.modelStore.GetServer(event.ServerName, true, true) if err != nil { logger.WithError(err).Errorf("Failed to get server %s", event.ServerName) return @@ -702,12 +718,12 @@ func (s *SchedulerServer) handleServerEvents(event coordinator.ServerEventMsg) { if s.config.AutoScalingServerEnabled { if event.UpdateContext == coordinator.SERVER_SCALE_DOWN { - if ok, replicas := shouldScaleDown(server, float32(s.config.PackThreshold)); ok { + if ok, replicas := shouldScaleDown(server, stats, float32(s.config.PackThreshold)); ok { logger.Infof("Server %s is scaling down to %d", event.ServerName, replicas) s.sendServerScale(server, replicas) } } else if event.UpdateContext == coordinator.SERVER_SCALE_UP { - if ok, replicas := shouldScaleUp(server); ok { + if ok, replicas := shouldScaleUp(server, stats); ok { logger.Infof("Server %s is scaling up to %d", event.ServerName, replicas) s.sendServerScale(server, replicas) } @@ -735,14 +751,14 @@ func (s *SchedulerServer) updateServerModelsStatus(evt coordinator.ModelEventMsg logger.Warnf("Failed to find model version %s so ignoring event", evt.String()) return nil } - if modelVersion.Server() == "" { + if modelVersion.Server == "" { logger.Warnf("Empty server for %s so ignoring event", evt.String()) return nil } s.serverEventStream.pendingLock.Lock() // we are coalescing events so we only send one event (the latest status) per server - s.serverEventStream.pendingEvents[modelVersion.Server()] = struct{}{} + s.serverEventStream.pendingEvents[modelVersion.Server] = struct{}{} if s.serverEventStream.trigger == nil { s.serverEventStream.trigger = time.AfterFunc(defaultBatchWait, s.sendServerStatus) } @@ -765,7 +781,7 @@ func (s *SchedulerServer) sendServerStatus() { s.serverEventStream.mu.Lock() defer s.serverEventStream.mu.Unlock() for serverName := range pendingServers { - server, err := s.modelStore.GetServer(serverName, true, true) + server, _, err := s.modelStore.GetServer(serverName, true, true) if err != nil { logger.Errorf("Failed to get server %s", serverName) continue @@ -775,7 +791,7 @@ func (s *SchedulerServer) sendServerStatus() { } } -func (s *SchedulerServer) sendServerScale(server *store.ServerSnapshot, expectedReplicas uint32) { +func (s *SchedulerServer) sendServerScale(server *db.Server, expectedReplicas uint32) { // TODO: should there be some sort of velocity check ? logger := s.logger.WithField("func", "sendServerScale") logger.Debugf("will attempt to scale servers to %d for %v", expectedReplicas, server.Name) diff --git a/scheduler/pkg/server/utils.go b/scheduler/pkg/server/utils.go index ee63e69f49..3c358a7deb 100644 --- a/scheduler/pkg/server/utils.go +++ b/scheduler/pkg/server/utils.go @@ -15,6 +15,7 @@ import ( "math/rand/v2" "time" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" @@ -42,21 +43,20 @@ func sendWithTimeout(f func() error, d time.Duration) (bool, error) { } } -func shouldScaleUp(server *store.ServerSnapshot) (bool, uint32) { +func shouldScaleUp(server *db.Server, stats *store.ServerStats) (bool, uint32) { if server.ExpectedReplicas < 0 || server.MaxReplicas < 1 { return false, 0 } - if server.Stats != nil { - maxNumReplicaHostedModels := server.Stats.MaxNumReplicaHostedModels + if stats != nil { + maxNumReplicaHostedModels := stats.MaxNumReplicaHostedModels return maxNumReplicaHostedModels > uint32(server.ExpectedReplicas), min(maxNumReplicaHostedModels, uint32(server.MaxReplicas)) } return false, 0 } -func shouldScaleDown(server *store.ServerSnapshot, perc float32) (bool, uint32) { +func shouldScaleDown(server *db.Server, stats *store.ServerStats, perc float32) (bool, uint32) { - if server.Stats != nil { - stats := server.Stats + if stats != nil { currentReplicas := uint32(server.ExpectedReplicas) minReplicas := uint32(server.MinReplicas) if minReplicas == 0 { diff --git a/scheduler/pkg/store/store.go b/scheduler/pkg/store/api.go similarity index 95% rename from scheduler/pkg/store/store.go rename to scheduler/pkg/store/api.go index cc141996b3..12a1c2b3bf 100644 --- a/scheduler/pkg/store/store.go +++ b/scheduler/pkg/store/api.go @@ -136,8 +136,8 @@ func (m *ModelSnapshot) GetVersionsBeforeLastModelGwAvailable() []*ModelVersion return nil } -//go:generate go tool mockgen -source=./store.go -destination=./mock/store.go -package=mock ModelStore -type ModelStore interface { +//go:generate go tool mockgen -source=./api.go -destination=./mock/store.go -package=mock ModelServerAPI +type ModelServerAPI interface { UpdateModel(config *pb.LoadModelRequest) error GetModel(key string) (*db.Model, error) GetModels() ([]*db.Model, error) @@ -157,5 +157,6 @@ type ModelStore interface { FailedScheduling(modelID string, version uint32, reason string, reset bool) error GetAllModels() ([]string, error) SetModelGwModelState(name string, versionNumber uint32, status db.ModelState, reason string, source string) error + // TODO better name... should it even be on tihs interface? EmitEvents() error } diff --git a/scheduler/pkg/store/experiment/store.go b/scheduler/pkg/store/experiment/store.go index df997afc25..80dadf60c2 100644 --- a/scheduler/pkg/store/experiment/store.go +++ b/scheduler/pkg/store/experiment/store.go @@ -55,12 +55,12 @@ type ExperimentStore struct { pipelineBaselines map[string]*Experiment // pipeline name to the single baseline experiment it appears in pipelineReferences map[string]map[string]*Experiment // pipeline name to experiments it appears in eventHub *coordinator.EventHub - store store.ModelStore + store store.ModelServerAPI pipelineStore pipeline.PipelineHandler db *ExperimentDBManager } -func NewExperimentServer(logger logrus.FieldLogger, eventHub *coordinator.EventHub, store store.ModelStore, pipelineStore pipeline.PipelineHandler) *ExperimentStore { +func NewExperimentServer(logger logrus.FieldLogger, eventHub *coordinator.EventHub, store store.ModelServerAPI, pipelineStore pipeline.PipelineHandler) *ExperimentStore { es := &ExperimentStore{ logger: logger.WithField("source", "experimentServer"), experiments: make(map[string]*Experiment), diff --git a/scheduler/pkg/store/experiment/store_test.go b/scheduler/pkg/store/experiment/store_test.go index fd73b7a6a0..022419c60e 100644 --- a/scheduler/pkg/store/experiment/store_test.go +++ b/scheduler/pkg/store/experiment/store_test.go @@ -452,7 +452,7 @@ type fakeModelStore struct { status map[string]store.ModelState } -var _ store.ModelStore = (*fakeModelStore)(nil) +var _ store.ModelServerAPI = (*fakeModelStore)(nil) func (f fakeModelStore) UpdateModel(config *scheduler.LoadModelRequest) error { panic("implement me") diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index e91e9453eb..49772667eb 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -26,22 +26,29 @@ import ( ) type MemoryStore struct { - mu sync.RWMutex - opLocks sync.Map - store Storage + mu sync.RWMutex + opLocks sync.Map + store struct { + models Storage[*db.Model] + servers Storage[*db.Server] + } logger log.FieldLogger eventHub *coordinator.EventHub } -var _ ModelStore = &MemoryStore{} +var _ ModelServerAPI = &MemoryStore{} func NewMemoryStore( logger log.FieldLogger, - store Storage, + modelStore Storage[*db.Model], + serverStore Storage[*db.Server], eventHub *coordinator.EventHub, ) *MemoryStore { return &MemoryStore{ - store: store, + store: struct { + models Storage[*db.Model] + servers Storage[*db.Server] + }{models: modelStore, servers: serverStore}, logger: logger.WithField("source", "MemoryStore"), eventHub: eventHub, } @@ -52,7 +59,7 @@ func (m *MemoryStore) GetAllModels() ([]string, error) { defer m.mu.RUnlock() var modelNames []string - models, err := m.store.ListModels() + models, err := m.store.models.List() if err != nil { return nil, err } @@ -66,18 +73,18 @@ func (m *MemoryStore) GetAllModels() ([]string, error) { func (m *MemoryStore) GetModels() ([]*db.Model, error) { m.mu.RLock() defer m.mu.RUnlock() - return m.store.ListModels() + return m.store.models.List() } func (m *MemoryStore) addModelVersionIfNotExists(req *agent.ModelVersion) (*db.Model, *db.ModelVersion, error) { modelName := req.GetModel().GetMeta().GetName() - model, err := m.store.GetModel(modelName) + model, err := m.store.models.Get(modelName) if err != nil { if !errors.Is(err, ErrNotFound) { return nil, nil, err } model = &db.Model{Name: modelName} - if err := m.store.AddModel(model); err != nil { + if err := m.store.models.Insert(model); err != nil { return nil, nil, err } } @@ -125,14 +132,14 @@ func (m *MemoryStore) UpdateModel(req *pb.LoadModelRequest) error { modelName, ) } - model, err := m.store.GetModel(modelName) + model, err := m.store.models.Get(modelName) if err != nil { if !errors.Is(err, ErrNotFound) { return fmt.Errorf("failed to get model %s: %w", modelName, err) } model = &db.Model{Name: modelName} m.addNextModelVersion(model, req.GetModel()) - if err := m.store.AddModel(model); err != nil { + if err := m.store.models.Insert(model); err != nil { return fmt.Errorf("failed to update model %s: %w", modelName, err) } return nil @@ -141,7 +148,7 @@ func (m *MemoryStore) UpdateModel(req *pb.LoadModelRequest) error { if model.Deleted { if model.Inactive() { m.addNextModelVersion(model, req.GetModel()) - if err := m.store.UpdateModel(model); err != nil { + if err := m.store.models.Update(model); err != nil { return fmt.Errorf("failed to update model %s: %w", modelName, err) } return nil @@ -182,7 +189,7 @@ func (m *MemoryStore) UpdateModel(req *pb.LoadModelRequest) error { } if changed { - if err := m.store.UpdateModel(model); err != nil { + if err := m.store.models.Update(model); err != nil { return fmt.Errorf("failed to update model %s: %w", modelName, err) } } @@ -222,7 +229,7 @@ func (m *MemoryStore) UnlockModel(modelId string) { func (m *MemoryStore) GetModel(key string) (*db.Model, error) { m.mu.RLock() defer m.mu.RUnlock() - model, err := m.store.GetModel(key) + model, err := m.store.models.Get(key) if err != nil { return nil, fmt.Errorf("failed to get model %s: %w", key, err) } @@ -242,7 +249,7 @@ func (m *MemoryStore) removeModelImpl(req *pb.UnloadModelRequest) error { defer m.mu.Unlock() modelName := req.GetModel().GetName() - model, err := m.store.GetModel(modelName) + model, err := m.store.models.Get(modelName) if err == nil { // Updating the k8s meta is required to be updated so status updates back (to manager) // will match latest generation value. Previous generation values might be ignored by manager. @@ -266,7 +273,7 @@ func (m *MemoryStore) GetServers(shallow bool, modelDetails bool) ([]*db.Server, m.mu.RLock() defer m.mu.RUnlock() - servers, err := m.store.ListServers() + servers, err := m.store.servers.List() if err != nil { return nil, fmt.Errorf("failed to list servers: %w", err) } @@ -278,7 +285,7 @@ func (m *MemoryStore) GetServer(serverKey string, shallow bool, modelDetails boo m.mu.RLock() defer m.mu.RUnlock() - server, err := m.store.GetServer(serverKey) + server, err := m.store.servers.Get(serverKey) if err != nil { if errors.Is(err, ErrNotFound) { return nil, nil, fmt.Errorf("server [%s] not found", serverKey) @@ -321,7 +328,7 @@ func (m *MemoryStore) getModelServer( serverKey string, ) (*db.Model, *db.ModelVersion, *db.Server, error) { // Validate - model, err := m.store.GetModel(modelKey) + model, err := m.store.models.Get(modelKey) if err != nil { return nil, nil, nil, fmt.Errorf("failed to find model %s", modelKey) } @@ -329,7 +336,7 @@ func (m *MemoryStore) getModelServer( if modelVersion == nil { return nil, nil, nil, fmt.Errorf("Version not found for model %s, version %d", modelKey, version) } - server, err := m.store.GetServer(serverKey) + server, err := m.store.servers.Get(serverKey) if err != nil { return nil, nil, nil, fmt.Errorf("failed to find server %s", serverKey) } @@ -367,7 +374,7 @@ func (m *MemoryStore) updateLoadedModelsImpl( // Validate // TODO pointer issue - model, err := m.store.GetModel(modelKey) + model, err := m.store.models.Get(modelKey) if err != nil { if errors.Is(err, ErrNotFound) { return nil, fmt.Errorf("model [%s] not found", modelKey) @@ -394,7 +401,7 @@ func (m *MemoryStore) updateLoadedModelsImpl( }, nil } - server, err := m.store.GetServer(serverKey) + server, err := m.store.servers.Get(serverKey) if err != nil { return nil, fmt.Errorf("failed to find server %s", serverKey) } @@ -503,7 +510,7 @@ func (m *MemoryStore) unloadVersionModelsImpl(modelKey string, version uint32) ( // Validate // TODO pointer issue - model, err := m.store.GetModel(modelKey) + model, err := m.store.models.Get(modelKey) if err != nil { return nil, false, fmt.Errorf("failed to find model %s", modelKey) } @@ -625,7 +632,7 @@ func (m *MemoryStore) updateModelStateImpl( desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED || desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED { // TODO pointer - server, err := m.store.GetServer(serverKey) + server, err := m.store.servers.Get(serverKey) // TODO handle error if err == nil { replica, ok := server.Replicas[int32(replicaIdx)] @@ -679,7 +686,7 @@ func (m *MemoryStore) updateReservedMemory( ) { // update reserved memory that is being used for sorting replicas // do we need to lock replica update? - server, err := m.store.GetServer(serverKey) + server, err := m.store.servers.Get(serverKey) // TODO handle error?? if err == nil { replica, okReplica := server.Replicas[int32(replicaIdx)] @@ -720,13 +727,13 @@ func (m *MemoryStore) addServerReplicaImpl(request *agent.AgentSubscribeRequest) defer m.mu.Unlock() // TODO pointer issue - server, err := m.store.GetServer(request.ServerName) + server, err := m.store.servers.Get(request.ServerName) if err != nil { if !errors.Is(err, ErrNotFound) { return nil, coordinator.ServerEventMsg{}, err } server = NewServer(request.ServerName, request.Shared) - if err := m.store.AddServer(server); err != nil { + if err := m.store.servers.Insert(server); err != nil { return nil, coordinator.ServerEventMsg{}, fmt.Errorf("failed to add server %s: %w", request.ServerName, err) } } @@ -787,7 +794,7 @@ func (m *MemoryStore) removeServerReplicaImpl(serverName string, replicaIdx int) m.mu.Lock() defer m.mu.Unlock() - server, err := m.store.GetServer(serverName) + server, err := m.store.servers.Get(serverName) if err != nil { return nil, nil, fmt.Errorf("failed to find server %s: %w", serverName, err) } @@ -798,7 +805,7 @@ func (m *MemoryStore) removeServerReplicaImpl(serverName string, replicaIdx int) delete(server.Replicas, int32(replicaIdx)) // TODO we should not reschedule models on servers with dedicated models, e.g. non shareable servers if len(server.Replicas) == 0 { - if err := m.store.DeleteServer(serverName); err != nil { + if err := m.store.servers.Delete(serverName); err != nil { return nil, nil, fmt.Errorf("failed to delete server %s: %w", serverName, err) } } @@ -818,7 +825,7 @@ func (m *MemoryStore) removeModelfromServerReplica(models []*db.ModelVersionID, // Find models to reschedule due to this server replica being removed for _, v := range models { // TODO pointer issue - model, err := m.store.GetModel(v.Name) + model, err := m.store.models.Get(v.Name) if err == nil { modelVersion := model.GetVersion(v.Version) if modelVersion != nil { @@ -866,7 +873,7 @@ func (m *MemoryStore) DrainServerReplica(serverName string, replicaIdx int) ([]s func (m *MemoryStore) drainServerReplicaImpl(serverName string, replicaIdx int) ([]string, error) { logger := m.logger.WithField("func", "DrainServerReplica") // TODO pointer issue - server, err := m.store.GetServer(serverName) + server, err := m.store.servers.Get(serverName) if err != nil { return nil, fmt.Errorf("Failed to find server %s", serverName) } @@ -897,7 +904,7 @@ func (m *MemoryStore) findModelsToReSchedule(models []*db.ModelVersionID, replic for _, v := range models { // TODO pointer issue // TODO handle error? - model, err := m.store.GetModel(v.Name) + model, err := m.store.models.Get(v.Name) if err == nil { modelVersion := model.GetVersion(v.Version) if modelVersion != nil { @@ -919,7 +926,7 @@ func (m *MemoryStore) ServerNotify(request *pb.ServerNotify) error { logger.Debugf("ServerNotify %v", request) - server, err := m.store.GetServer(request.Name) + server, err := m.store.servers.Get(request.Name) if err != nil { if !errors.Is(err, ErrNotFound) { return fmt.Errorf("failed to find server %s: %w", request.Name, err) @@ -929,7 +936,7 @@ func (m *MemoryStore) ServerNotify(request *pb.ServerNotify) error { server.MinReplicas = int32(request.MinReplicas) server.MaxReplicas = int32(request.MaxReplicas) server.KubernetesMeta = request.KubernetesMeta - if err := m.store.AddServer(server); err != nil { + if err := m.store.servers.Insert(server); err != nil { return err } return nil @@ -940,7 +947,7 @@ func (m *MemoryStore) ServerNotify(request *pb.ServerNotify) error { server.MaxReplicas = int32(request.MaxReplicas) server.KubernetesMeta = request.KubernetesMeta - if err := m.store.UpdateServer(server); err != nil { + if err := m.store.servers.Update(server); err != nil { return fmt.Errorf("failed to update server %s: %w", request.Name, err) } @@ -949,7 +956,7 @@ func (m *MemoryStore) ServerNotify(request *pb.ServerNotify) error { func (m *MemoryStore) numEmptyServerReplicas(serverName string) (uint32, error) { emptyReplicas := uint32(0) - server, err := m.store.GetServer(serverName) + server, err := m.store.servers.Get(serverName) if err != nil { if !errors.Is(err, ErrNotFound) { return 0, err @@ -965,7 +972,7 @@ func (m *MemoryStore) numEmptyServerReplicas(serverName string) (uint32, error) } func (m *MemoryStore) maxNumModelReplicasForServer(serverName string) (uint32, error) { - models, err := m.store.ListModels() + models, err := m.store.models.List() if err != nil { return 0, err } @@ -1016,7 +1023,7 @@ func (m *MemoryStore) setModelGwModelStateImpl(name string, versionNumber uint32 defer m.mu.Unlock() // TODO pointer issue - model, err := m.store.GetModel(name) + model, err := m.store.models.Get(name) if err != nil { return nil, fmt.Errorf("failed to find model %s", name) } @@ -1039,7 +1046,7 @@ func (m *MemoryStore) setModelGwModelStateImpl(name string, versionNumber uint32 } func (m *MemoryStore) EmitEvents() error { - servers, err := m.store.ListServers() + servers, err := m.store.servers.List() if err != nil { return err } @@ -1055,7 +1062,7 @@ func (m *MemoryStore) EmitEvents() error { } } - models, err := m.store.ListModels() + models, err := m.store.models.List() if err != nil { return err } diff --git a/scheduler/pkg/store/memory_status.go b/scheduler/pkg/store/memory_status.go index a996bfe7c0..b0e17a2e0a 100644 --- a/scheduler/pkg/store/memory_status.go +++ b/scheduler/pkg/store/memory_status.go @@ -125,7 +125,7 @@ func (m *MemoryStore) FailedScheduling(modelID string, version uint32, reason st m.mu.Lock() defer m.mu.Unlock() - model, err := m.store.GetModel(modelID) + model, err := m.store.models.Get(modelID) if err != nil { return fmt.Errorf("model %s not found: %w", modelID, err) } @@ -151,7 +151,7 @@ func (m *MemoryStore) FailedScheduling(modelID string, version uint32, reason st modelVersion.Server = "" } - if err := m.store.UpdateModel(model); err != nil { + if err := m.store.models.Update(model); err != nil { return fmt.Errorf("failed to update model %s: %w", modelID, err) } @@ -192,7 +192,7 @@ func (m *MemoryStore) UnloadModelGwVersionModels(modelKey string, version uint32 m.mu.Lock() defer m.mu.Unlock() - model, err := m.store.GetModel(modelKey) + model, err := m.store.models.Get(modelKey) if err != nil { return false, fmt.Errorf("failed to find model %s: %w", modelKey, err) } @@ -204,7 +204,7 @@ func (m *MemoryStore) UnloadModelGwVersionModels(modelKey string, version uint32 m.setModelGwStatusToTerminate(false, modelVersion) - if err := m.store.UpdateModel(model); err != nil { + if err := m.store.models.Update(model); err != nil { return false, fmt.Errorf("failed to update model %s: %w", modelKey, err) } diff --git a/scheduler/pkg/store/mock/store.go b/scheduler/pkg/store/mock/store.go index d6091f1c84..f73802a318 100644 --- a/scheduler/pkg/store/mock/store.go +++ b/scheduler/pkg/store/mock/store.go @@ -1,18 +1,9 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - // Code generated by MockGen. DO NOT EDIT. -// Source: ./store.go +// Source: ./api.go // // Generated by this command: // -// mockgen -source=./store.go -destination=./mock/store.go -package=mock ModelStore +// mockgen -source=./api.go -destination=./mock/store.go -package=mock ModelServerAPI // // Package mock is a generated GoMock package. @@ -23,35 +14,36 @@ import ( agent "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" scheduler "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + db "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" store "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" gomock "go.uber.org/mock/gomock" ) -// MockModelStore is a mock of ModelStore interface. -type MockModelStore struct { +// MockModelServerAPI is a mock of ModelServerAPI interface. +type MockModelServerAPI struct { ctrl *gomock.Controller - recorder *MockModelStoreMockRecorder + recorder *MockModelServerAPIMockRecorder } -// MockModelStoreMockRecorder is the mock recorder for MockModelStore. -type MockModelStoreMockRecorder struct { - mock *MockModelStore +// MockModelServerAPIMockRecorder is the mock recorder for MockModelServerAPI. +type MockModelServerAPIMockRecorder struct { + mock *MockModelServerAPI } -// NewMockModelStore creates a new mock instance. -func NewMockModelStore(ctrl *gomock.Controller) *MockModelStore { - mock := &MockModelStore{ctrl: ctrl} - mock.recorder = &MockModelStoreMockRecorder{mock} +// NewMockModelServerAPI creates a new mock instance. +func NewMockModelServerAPI(ctrl *gomock.Controller) *MockModelServerAPI { + mock := &MockModelServerAPI{ctrl: ctrl} + mock.recorder = &MockModelServerAPIMockRecorder{mock} return mock } // EXPECT returns an object that allows the caller to indicate expected use. -func (m *MockModelStore) EXPECT() *MockModelStoreMockRecorder { +func (m *MockModelServerAPI) EXPECT() *MockModelServerAPIMockRecorder { return m.recorder } // AddServerReplica mocks base method. -func (m *MockModelStore) AddServerReplica(request *agent.AgentSubscribeRequest) error { +func (m *MockModelServerAPI) AddServerReplica(request *agent.AgentSubscribeRequest) error { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "AddServerReplica", request) ret0, _ := ret[0].(error) @@ -59,13 +51,13 @@ func (m *MockModelStore) AddServerReplica(request *agent.AgentSubscribeRequest) } // AddServerReplica indicates an expected call of AddServerReplica. -func (mr *MockModelStoreMockRecorder) AddServerReplica(request any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) AddServerReplica(request any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddServerReplica", reflect.TypeOf((*MockModelStore)(nil).AddServerReplica), request) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "AddServerReplica", reflect.TypeOf((*MockModelServerAPI)(nil).AddServerReplica), request) } // DrainServerReplica mocks base method. -func (m *MockModelStore) DrainServerReplica(serverName string, replicaIdx int) ([]string, error) { +func (m *MockModelServerAPI) DrainServerReplica(serverName string, replicaIdx int) ([]string, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "DrainServerReplica", serverName, replicaIdx) ret0, _ := ret[0].([]string) @@ -74,13 +66,13 @@ func (m *MockModelStore) DrainServerReplica(serverName string, replicaIdx int) ( } // DrainServerReplica indicates an expected call of DrainServerReplica. -func (mr *MockModelStoreMockRecorder) DrainServerReplica(serverName, replicaIdx any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) DrainServerReplica(serverName, replicaIdx any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DrainServerReplica", reflect.TypeOf((*MockModelStore)(nil).DrainServerReplica), serverName, replicaIdx) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DrainServerReplica", reflect.TypeOf((*MockModelServerAPI)(nil).DrainServerReplica), serverName, replicaIdx) } // EmitEvents mocks base method. -func (m *MockModelStore) EmitEvents() error { +func (m *MockModelServerAPI) EmitEvents() error { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "EmitEvents") ret0, _ := ret[0].(error) @@ -88,13 +80,13 @@ func (m *MockModelStore) EmitEvents() error { } // EmitEvents indicates an expected call of EmitEvents. -func (mr *MockModelStoreMockRecorder) EmitEvents() *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) EmitEvents() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EmitEvents", reflect.TypeOf((*MockModelStore)(nil).EmitEvents)) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "EmitEvents", reflect.TypeOf((*MockModelServerAPI)(nil).EmitEvents)) } // FailedScheduling mocks base method. -func (m *MockModelStore) FailedScheduling(modelID string, version uint32, reason string, reset bool) error { +func (m *MockModelServerAPI) FailedScheduling(modelID string, version uint32, reason string, reset bool) error { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "FailedScheduling", modelID, version, reason, reset) ret0, _ := ret[0].(error) @@ -102,13 +94,13 @@ func (m *MockModelStore) FailedScheduling(modelID string, version uint32, reason } // FailedScheduling indicates an expected call of FailedScheduling. -func (mr *MockModelStoreMockRecorder) FailedScheduling(modelID, version, reason, reset any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) FailedScheduling(modelID, version, reason, reset any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FailedScheduling", reflect.TypeOf((*MockModelStore)(nil).FailedScheduling), modelID, version, reason, reset) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FailedScheduling", reflect.TypeOf((*MockModelServerAPI)(nil).FailedScheduling), modelID, version, reason, reset) } // GetAllModels mocks base method. -func (m *MockModelStore) GetAllModels() ([]string, error) { +func (m *MockModelServerAPI) GetAllModels() ([]string, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "GetAllModels") ret0, _ := ret[0].([]string) @@ -117,85 +109,86 @@ func (m *MockModelStore) GetAllModels() ([]string, error) { } // GetAllModels indicates an expected call of GetAllModels. -func (mr *MockModelStoreMockRecorder) GetAllModels() *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) GetAllModels() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetAllModels", reflect.TypeOf((*MockModelStore)(nil).GetAllModels)) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetAllModels", reflect.TypeOf((*MockModelServerAPI)(nil).GetAllModels)) } // GetModel mocks base method. -func (m *MockModelStore) GetModel(key string) (*store.ModelSnapshot, error) { +func (m *MockModelServerAPI) GetModel(key string) (*db.Model, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "GetModel", key) - ret0, _ := ret[0].(*store.ModelSnapshot) + ret0, _ := ret[0].(*db.Model) ret1, _ := ret[1].(error) return ret0, ret1 } // GetModel indicates an expected call of GetModel. -func (mr *MockModelStoreMockRecorder) GetModel(key any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) GetModel(key any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetModel", reflect.TypeOf((*MockModelStore)(nil).GetModel), key) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetModel", reflect.TypeOf((*MockModelServerAPI)(nil).GetModel), key) } // GetModels mocks base method. -func (m *MockModelStore) GetModels() ([]*store.ModelSnapshot, error) { +func (m *MockModelServerAPI) GetModels() ([]*db.Model, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "GetModels") - ret0, _ := ret[0].([]*store.ModelSnapshot) + ret0, _ := ret[0].([]*db.Model) ret1, _ := ret[1].(error) return ret0, ret1 } // GetModels indicates an expected call of GetModels. -func (mr *MockModelStoreMockRecorder) GetModels() *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) GetModels() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetModels", reflect.TypeOf((*MockModelStore)(nil).GetModels)) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetModels", reflect.TypeOf((*MockModelServerAPI)(nil).GetModels)) } // GetServer mocks base method. -func (m *MockModelStore) GetServer(serverKey string, shallow, modelDetails bool) (*store.ServerSnapshot, error) { +func (m *MockModelServerAPI) GetServer(serverKey string, shallow, modelDetails bool) (*db.Server, *store.ServerStats, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "GetServer", serverKey, shallow, modelDetails) - ret0, _ := ret[0].(*store.ServerSnapshot) - ret1, _ := ret[1].(error) - return ret0, ret1 + ret0, _ := ret[0].(*db.Server) + ret1, _ := ret[1].(*store.ServerStats) + ret2, _ := ret[2].(error) + return ret0, ret1, ret2 } // GetServer indicates an expected call of GetServer. -func (mr *MockModelStoreMockRecorder) GetServer(serverKey, shallow, modelDetails any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) GetServer(serverKey, shallow, modelDetails any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetServer", reflect.TypeOf((*MockModelStore)(nil).GetServer), serverKey, shallow, modelDetails) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetServer", reflect.TypeOf((*MockModelServerAPI)(nil).GetServer), serverKey, shallow, modelDetails) } // GetServers mocks base method. -func (m *MockModelStore) GetServers(shallow, modelDetails bool) ([]*store.ServerSnapshot, error) { +func (m *MockModelServerAPI) GetServers(shallow, modelDetails bool) ([]*db.Server, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "GetServers", shallow, modelDetails) - ret0, _ := ret[0].([]*store.ServerSnapshot) + ret0, _ := ret[0].([]*db.Server) ret1, _ := ret[1].(error) return ret0, ret1 } // GetServers indicates an expected call of GetServers. -func (mr *MockModelStoreMockRecorder) GetServers(shallow, modelDetails any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) GetServers(shallow, modelDetails any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetServers", reflect.TypeOf((*MockModelStore)(nil).GetServers), shallow, modelDetails) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetServers", reflect.TypeOf((*MockModelServerAPI)(nil).GetServers), shallow, modelDetails) } // LockModel mocks base method. -func (m *MockModelStore) LockModel(modelId string) { +func (m *MockModelServerAPI) LockModel(modelId string) { m.ctrl.T.Helper() m.ctrl.Call(m, "LockModel", modelId) } // LockModel indicates an expected call of LockModel. -func (mr *MockModelStoreMockRecorder) LockModel(modelId any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) LockModel(modelId any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "LockModel", reflect.TypeOf((*MockModelStore)(nil).LockModel), modelId) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "LockModel", reflect.TypeOf((*MockModelServerAPI)(nil).LockModel), modelId) } // RemoveModel mocks base method. -func (m *MockModelStore) RemoveModel(req *scheduler.UnloadModelRequest) error { +func (m *MockModelServerAPI) RemoveModel(req *scheduler.UnloadModelRequest) error { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "RemoveModel", req) ret0, _ := ret[0].(error) @@ -203,13 +196,13 @@ func (m *MockModelStore) RemoveModel(req *scheduler.UnloadModelRequest) error { } // RemoveModel indicates an expected call of RemoveModel. -func (mr *MockModelStoreMockRecorder) RemoveModel(req any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) RemoveModel(req any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemoveModel", reflect.TypeOf((*MockModelStore)(nil).RemoveModel), req) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemoveModel", reflect.TypeOf((*MockModelServerAPI)(nil).RemoveModel), req) } // RemoveServerReplica mocks base method. -func (m *MockModelStore) RemoveServerReplica(serverName string, replicaIdx int) ([]string, error) { +func (m *MockModelServerAPI) RemoveServerReplica(serverName string, replicaIdx int) ([]string, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "RemoveServerReplica", serverName, replicaIdx) ret0, _ := ret[0].([]string) @@ -218,13 +211,13 @@ func (m *MockModelStore) RemoveServerReplica(serverName string, replicaIdx int) } // RemoveServerReplica indicates an expected call of RemoveServerReplica. -func (mr *MockModelStoreMockRecorder) RemoveServerReplica(serverName, replicaIdx any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) RemoveServerReplica(serverName, replicaIdx any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemoveServerReplica", reflect.TypeOf((*MockModelStore)(nil).RemoveServerReplica), serverName, replicaIdx) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RemoveServerReplica", reflect.TypeOf((*MockModelServerAPI)(nil).RemoveServerReplica), serverName, replicaIdx) } // ServerNotify mocks base method. -func (m *MockModelStore) ServerNotify(request *scheduler.ServerNotify) error { +func (m *MockModelServerAPI) ServerNotify(request *scheduler.ServerNotify) error { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "ServerNotify", request) ret0, _ := ret[0].(error) @@ -232,13 +225,13 @@ func (m *MockModelStore) ServerNotify(request *scheduler.ServerNotify) error { } // ServerNotify indicates an expected call of ServerNotify. -func (mr *MockModelStoreMockRecorder) ServerNotify(request any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) ServerNotify(request any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ServerNotify", reflect.TypeOf((*MockModelStore)(nil).ServerNotify), request) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ServerNotify", reflect.TypeOf((*MockModelServerAPI)(nil).ServerNotify), request) } // SetModelGwModelState mocks base method. -func (m *MockModelStore) SetModelGwModelState(name string, versionNumber uint32, status store.ModelState, reason, source string) error { +func (m *MockModelServerAPI) SetModelGwModelState(name string, versionNumber uint32, status db.ModelState, reason, source string) error { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "SetModelGwModelState", name, versionNumber, status, reason, source) ret0, _ := ret[0].(error) @@ -246,13 +239,13 @@ func (m *MockModelStore) SetModelGwModelState(name string, versionNumber uint32, } // SetModelGwModelState indicates an expected call of SetModelGwModelState. -func (mr *MockModelStoreMockRecorder) SetModelGwModelState(name, versionNumber, status, reason, source any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) SetModelGwModelState(name, versionNumber, status, reason, source any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetModelGwModelState", reflect.TypeOf((*MockModelStore)(nil).SetModelGwModelState), name, versionNumber, status, reason, source) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetModelGwModelState", reflect.TypeOf((*MockModelServerAPI)(nil).SetModelGwModelState), name, versionNumber, status, reason, source) } // UnloadModelGwVersionModels mocks base method. -func (m *MockModelStore) UnloadModelGwVersionModels(modelKey string, version uint32) (bool, error) { +func (m *MockModelServerAPI) UnloadModelGwVersionModels(modelKey string, version uint32) (bool, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "UnloadModelGwVersionModels", modelKey, version) ret0, _ := ret[0].(bool) @@ -261,13 +254,13 @@ func (m *MockModelStore) UnloadModelGwVersionModels(modelKey string, version uin } // UnloadModelGwVersionModels indicates an expected call of UnloadModelGwVersionModels. -func (mr *MockModelStoreMockRecorder) UnloadModelGwVersionModels(modelKey, version any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) UnloadModelGwVersionModels(modelKey, version any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnloadModelGwVersionModels", reflect.TypeOf((*MockModelStore)(nil).UnloadModelGwVersionModels), modelKey, version) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnloadModelGwVersionModels", reflect.TypeOf((*MockModelServerAPI)(nil).UnloadModelGwVersionModels), modelKey, version) } // UnloadVersionModels mocks base method. -func (m *MockModelStore) UnloadVersionModels(modelKey string, version uint32) (bool, error) { +func (m *MockModelServerAPI) UnloadVersionModels(modelKey string, version uint32) (bool, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "UnloadVersionModels", modelKey, version) ret0, _ := ret[0].(bool) @@ -276,25 +269,25 @@ func (m *MockModelStore) UnloadVersionModels(modelKey string, version uint32) (b } // UnloadVersionModels indicates an expected call of UnloadVersionModels. -func (mr *MockModelStoreMockRecorder) UnloadVersionModels(modelKey, version any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) UnloadVersionModels(modelKey, version any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnloadVersionModels", reflect.TypeOf((*MockModelStore)(nil).UnloadVersionModels), modelKey, version) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnloadVersionModels", reflect.TypeOf((*MockModelServerAPI)(nil).UnloadVersionModels), modelKey, version) } // UnlockModel mocks base method. -func (m *MockModelStore) UnlockModel(modelId string) { +func (m *MockModelServerAPI) UnlockModel(modelId string) { m.ctrl.T.Helper() m.ctrl.Call(m, "UnlockModel", modelId) } // UnlockModel indicates an expected call of UnlockModel. -func (mr *MockModelStoreMockRecorder) UnlockModel(modelId any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) UnlockModel(modelId any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnlockModel", reflect.TypeOf((*MockModelStore)(nil).UnlockModel), modelId) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnlockModel", reflect.TypeOf((*MockModelServerAPI)(nil).UnlockModel), modelId) } // UpdateLoadedModels mocks base method. -func (m *MockModelStore) UpdateLoadedModels(modelKey string, version uint32, serverKey string, replicas []*store.ServerReplica) error { +func (m *MockModelServerAPI) UpdateLoadedModels(modelKey string, version uint32, serverKey string, replicas []*db.ServerReplica) error { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "UpdateLoadedModels", modelKey, version, serverKey, replicas) ret0, _ := ret[0].(error) @@ -302,13 +295,13 @@ func (m *MockModelStore) UpdateLoadedModels(modelKey string, version uint32, ser } // UpdateLoadedModels indicates an expected call of UpdateLoadedModels. -func (mr *MockModelStoreMockRecorder) UpdateLoadedModels(modelKey, version, serverKey, replicas any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) UpdateLoadedModels(modelKey, version, serverKey, replicas any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateLoadedModels", reflect.TypeOf((*MockModelStore)(nil).UpdateLoadedModels), modelKey, version, serverKey, replicas) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateLoadedModels", reflect.TypeOf((*MockModelServerAPI)(nil).UpdateLoadedModels), modelKey, version, serverKey, replicas) } // UpdateModel mocks base method. -func (m *MockModelStore) UpdateModel(config *scheduler.LoadModelRequest) error { +func (m *MockModelServerAPI) UpdateModel(config *scheduler.LoadModelRequest) error { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "UpdateModel", config) ret0, _ := ret[0].(error) @@ -316,13 +309,13 @@ func (m *MockModelStore) UpdateModel(config *scheduler.LoadModelRequest) error { } // UpdateModel indicates an expected call of UpdateModel. -func (mr *MockModelStoreMockRecorder) UpdateModel(config any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) UpdateModel(config any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateModel", reflect.TypeOf((*MockModelStore)(nil).UpdateModel), config) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateModel", reflect.TypeOf((*MockModelServerAPI)(nil).UpdateModel), config) } // UpdateModelState mocks base method. -func (m *MockModelStore) UpdateModelState(modelKey string, version uint32, serverKey string, replicaIdx int, availableMemory *uint64, expectedState, desiredState store.ModelReplicaState, reason string, runtimeInfo *scheduler.ModelRuntimeInfo) error { +func (m *MockModelServerAPI) UpdateModelState(modelKey string, version uint32, serverKey string, replicaIdx int, availableMemory *uint64, expectedState, desiredState db.ModelReplicaState, reason string, runtimeInfo *scheduler.ModelRuntimeInfo) error { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "UpdateModelState", modelKey, version, serverKey, replicaIdx, availableMemory, expectedState, desiredState, reason, runtimeInfo) ret0, _ := ret[0].(error) @@ -330,7 +323,7 @@ func (m *MockModelStore) UpdateModelState(modelKey string, version uint32, serve } // UpdateModelState indicates an expected call of UpdateModelState. -func (mr *MockModelStoreMockRecorder) UpdateModelState(modelKey, version, serverKey, replicaIdx, availableMemory, expectedState, desiredState, reason, runtimeInfo any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) UpdateModelState(modelKey, version, serverKey, replicaIdx, availableMemory, expectedState, desiredState, reason, runtimeInfo any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateModelState", reflect.TypeOf((*MockModelStore)(nil).UpdateModelState), modelKey, version, serverKey, replicaIdx, availableMemory, expectedState, desiredState, reason, runtimeInfo) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateModelState", reflect.TypeOf((*MockModelServerAPI)(nil).UpdateModelState), modelKey, version, serverKey, replicaIdx, availableMemory, expectedState, desiredState, reason, runtimeInfo) } diff --git a/scheduler/pkg/store/pipeline/db.go b/scheduler/pkg/store/pipeline/db.go index 0bdbaa8f9f..72364b7cc7 100644 --- a/scheduler/pkg/store/pipeline/db.go +++ b/scheduler/pkg/store/pipeline/db.go @@ -16,7 +16,7 @@ import ( "github.com/sirupsen/logrus" "google.golang.org/protobuf/proto" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/utils" ) @@ -111,7 +111,7 @@ func (pdb *PipelineDBManager) restore(createPipelineCb func(pipeline *Pipeline)) continue } err := item.Value(func(v []byte) error { - snapshot := scheduler.PipelineSnapshot{} + snapshot := db.PipelineSnapshot{} err := proto.Unmarshal(v, &snapshot) if err != nil { return err @@ -146,7 +146,7 @@ func (pdb *PipelineDBManager) get(name string) (*Pipeline, error) { return err } return item.Value(func(v []byte) error { - snapshot := scheduler.PipelineSnapshot{} + snapshot := db.PipelineSnapshot{} err = proto.Unmarshal(v, &snapshot) if err != nil { return err diff --git a/scheduler/pkg/store/pipeline/status.go b/scheduler/pkg/store/pipeline/status.go index 784ac31b8a..3ef4488b4a 100644 --- a/scheduler/pkg/store/pipeline/status.go +++ b/scheduler/pkg/store/pipeline/status.go @@ -27,7 +27,7 @@ var member void type ModelStatusHandler struct { mu sync.RWMutex logger logrus.FieldLogger - store store.ModelStore + store store.ModelServerAPI modelReferences map[string]map[string]void } diff --git a/scheduler/pkg/store/pipeline/status_test.go b/scheduler/pkg/store/pipeline/status_test.go index 420acee979..801aad0336 100644 --- a/scheduler/pkg/store/pipeline/status_test.go +++ b/scheduler/pkg/store/pipeline/status_test.go @@ -25,7 +25,7 @@ type fakeModelStore struct { status map[string]store.ModelState } -var _ store.ModelStore = (*fakeModelStore)(nil) +var _ store.ModelServerAPI = (*fakeModelStore)(nil) func (f fakeModelStore) UpdateModel(config *scheduler.LoadModelRequest) error { panic("implement me") diff --git a/scheduler/pkg/store/pipeline/store.go b/scheduler/pkg/store/pipeline/store.go index cebf2557b7..b234a25aeb 100644 --- a/scheduler/pkg/store/pipeline/store.go +++ b/scheduler/pkg/store/pipeline/store.go @@ -62,7 +62,7 @@ type PipelineStore struct { modelStatusHandler ModelStatusHandler } -func NewPipelineStore(logger logrus.FieldLogger, eventHub *coordinator.EventHub, store store.ModelStore) *PipelineStore { +func NewPipelineStore(logger logrus.FieldLogger, eventHub *coordinator.EventHub, store store.ModelServerAPI) *PipelineStore { ps := &PipelineStore{ logger: logger.WithField("source", "pipelineStore"), eventHub: eventHub, diff --git a/scheduler/pkg/store/test_memory_hack.go b/scheduler/pkg/store/test_memory_hack.go index 3a3921120e..f78f9b37d9 100644 --- a/scheduler/pkg/store/test_memory_hack.go +++ b/scheduler/pkg/store/test_memory_hack.go @@ -11,8 +11,10 @@ package store import ( "errors" + "fmt" "testing" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" @@ -34,22 +36,23 @@ type ModelID struct { func NewTestMemory( t *testing.T, logger log.FieldLogger, - store *LocalSchedulerStore, + modelStore Storage[*db.Model], + serverStore Storage[*db.Server], eventHub *coordinator.EventHub) *TestMemoryStore { if t == nil { panic("testing.T is required, must only be run via tests") } - m := NewMemoryStore(logger, store, eventHub) + m := NewMemoryStore(logger, modelStore, serverStore, eventHub) return &TestMemoryStore{m} } -func (t *TestMemoryStore) DirectlyUpdateModelStatus(model ModelID, state ModelStatus) error { +func (t *TestMemoryStore) DirectlyUpdateModelStatus(model ModelID, state *db.ModelStatus) error { t.mu.Lock() defer t.mu.Unlock() - found, ok := t.store.models[model.Name] - if !ok { - return errors.New("model not found") + found, err := t.store.models.Get(model.Name) + if err != nil { + return fmt.Errorf("model not found: %w", err) } version := found.GetVersion(model.Version) @@ -57,6 +60,6 @@ func (t *TestMemoryStore) DirectlyUpdateModelStatus(model ModelID, state ModelSt return errors.New("version not found") } - version.state = state - return nil + version.State = state + return t.store.models.Update(found) } From ae2692e3c2b67662c696898ebff92e97954e6293 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Fri, 2 Jan 2026 17:46:25 +0000 Subject: [PATCH 08/50] wip: adding DB save points --- scheduler/pkg/store/memory.go | 105 +++++++++++------------- scheduler/pkg/store/memory_status.go | 12 ++- scheduler/pkg/store/test_memory_hack.go | 4 +- 3 files changed, 60 insertions(+), 61 deletions(-) diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index 49772667eb..5cf4920bde 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -25,7 +25,7 @@ import ( "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/utils" ) -type MemoryStore struct { +type ModelServerStore struct { mu sync.RWMutex opLocks sync.Map store struct { @@ -36,25 +36,25 @@ type MemoryStore struct { eventHub *coordinator.EventHub } -var _ ModelServerAPI = &MemoryStore{} +var _ ModelServerAPI = &ModelServerStore{} func NewMemoryStore( logger log.FieldLogger, modelStore Storage[*db.Model], serverStore Storage[*db.Server], eventHub *coordinator.EventHub, -) *MemoryStore { - return &MemoryStore{ +) *ModelServerStore { + return &ModelServerStore{ store: struct { models Storage[*db.Model] servers Storage[*db.Server] }{models: modelStore, servers: serverStore}, - logger: logger.WithField("source", "MemoryStore"), + logger: logger.WithField("source", "ModelServerStore"), eventHub: eventHub, } } -func (m *MemoryStore) GetAllModels() ([]string, error) { +func (m *ModelServerStore) GetAllModels() ([]string, error) { m.mu.RLock() defer m.mu.RUnlock() var modelNames []string @@ -70,13 +70,13 @@ func (m *MemoryStore) GetAllModels() ([]string, error) { return modelNames, nil } -func (m *MemoryStore) GetModels() ([]*db.Model, error) { +func (m *ModelServerStore) GetModels() ([]*db.Model, error) { m.mu.RLock() defer m.mu.RUnlock() return m.store.models.List() } -func (m *MemoryStore) addModelVersionIfNotExists(req *agent.ModelVersion) (*db.Model, *db.ModelVersion, error) { +func (m *ModelServerStore) addModelVersionIfNotExists(req *agent.ModelVersion) (*db.Model, *db.ModelVersion, error) { modelName := req.GetModel().GetMeta().GetName() model, err := m.store.models.Get(modelName) if err != nil { @@ -102,7 +102,7 @@ func (m *MemoryStore) addModelVersionIfNotExists(req *agent.ModelVersion) (*db.M // TODO pointer save issue } -func (m *MemoryStore) addNextModelVersion(model *db.Model, pbmodel *pb.Model) { +func (m *ModelServerStore) addNextModelVersion(model *db.Model, pbmodel *pb.Model) { // if we start from a clean state, lets use the generation id as the starting version // this is to ensure that we have monotonic increasing version numbers // and we never reset back to 1 @@ -119,7 +119,7 @@ func (m *MemoryStore) addNextModelVersion(model *db.Model, pbmodel *pb.Model) { }) } -func (m *MemoryStore) UpdateModel(req *pb.LoadModelRequest) error { +func (m *ModelServerStore) UpdateModel(req *pb.LoadModelRequest) error { m.mu.Lock() defer m.mu.Unlock() @@ -166,38 +166,32 @@ func (m *MemoryStore) UpdateModel(req *pb.LoadModelRequest) error { } meq := ModelEqualityCheck(latestModel.ModelDefn, req.GetModel()) - changed := false switch { case meq.Equal: logger.Debugf("Model %s semantically equal - doing nothing", modelName) + return nil case meq.ModelSpecDiffers: logger.Debugf("Model %s model spec differs - adding new version of model", modelName) m.addNextModelVersion(model, req.GetModel()) - changed = true case meq.DeploymentSpecDiffers: logger.Debugf( "Model %s deployment spec differs - updating latest model version with new spec", modelName, ) latestModel.ModelDefn.DeploymentSpec = req.GetModel().GetDeploymentSpec() - changed = true case meq.MetaDiffers: // Update just kubernetes meta latestModel.ModelDefn.Meta.KubernetesMeta = req.GetModel().GetMeta().GetKubernetesMeta() - changed = true } - if changed { - if err := m.store.models.Update(model); err != nil { - return fmt.Errorf("failed to update model %s: %w", modelName, err) - } + if err := m.store.models.Update(model); err != nil { + return fmt.Errorf("failed to update model %s: %w", modelName, err) } - return nil } -func (m *MemoryStore) deepCopy(model *Model, key string) *ModelSnapshot { +func (m *ModelServerStore) deepCopy(model *Model, key string) *ModelSnapshot { snapshot := &ModelSnapshot{ Name: key, Deleted: model.IsDeleted(), @@ -210,13 +204,13 @@ func (m *MemoryStore) deepCopy(model *Model, key string) *ModelSnapshot { return snapshot } -func (m *MemoryStore) LockModel(modelId string) { +func (m *ModelServerStore) LockModel(modelId string) { var lock sync.RWMutex existingLock, _ := m.opLocks.LoadOrStore(modelId, &lock) existingLock.(*sync.RWMutex).Lock() } -func (m *MemoryStore) UnlockModel(modelId string) { +func (m *ModelServerStore) UnlockModel(modelId string) { logger := m.logger.WithField("func", "UnlockModel") lock, loaded := m.opLocks.Load(modelId) if loaded { @@ -226,7 +220,7 @@ func (m *MemoryStore) UnlockModel(modelId string) { } } -func (m *MemoryStore) GetModel(key string) (*db.Model, error) { +func (m *ModelServerStore) GetModel(key string) (*db.Model, error) { m.mu.RLock() defer m.mu.RUnlock() model, err := m.store.models.Get(key) @@ -236,7 +230,7 @@ func (m *MemoryStore) GetModel(key string) (*db.Model, error) { return model, nil } -func (m *MemoryStore) RemoveModel(req *pb.UnloadModelRequest) error { +func (m *ModelServerStore) RemoveModel(req *pb.UnloadModelRequest) error { err := m.removeModelImpl(req) if err != nil { return err @@ -244,7 +238,7 @@ func (m *MemoryStore) RemoveModel(req *pb.UnloadModelRequest) error { return nil } -func (m *MemoryStore) removeModelImpl(req *pb.UnloadModelRequest) error { +func (m *ModelServerStore) removeModelImpl(req *pb.UnloadModelRequest) error { m.mu.Lock() defer m.mu.Unlock() @@ -259,8 +253,7 @@ func (m *MemoryStore) removeModelImpl(req *pb.UnloadModelRequest) error { model.Deleted = true m.setModelGwStatusToTerminate(true, model.Latest()) - m.updateModelStatus(true, true, model.Latest(), model.GetLastAvailableModelVersion()) - return nil + return m.updateModelStatus(true, true, model.Latest(), model.GetLastAvailableModelVersion(), model) } if errors.Is(err, ErrNotFound) { @@ -269,7 +262,7 @@ func (m *MemoryStore) removeModelImpl(req *pb.UnloadModelRequest) error { return err } -func (m *MemoryStore) GetServers(shallow bool, modelDetails bool) ([]*db.Server, error) { +func (m *ModelServerStore) GetServers(shallow bool, modelDetails bool) ([]*db.Server, error) { m.mu.RLock() defer m.mu.RUnlock() @@ -281,7 +274,7 @@ func (m *MemoryStore) GetServers(shallow bool, modelDetails bool) ([]*db.Server, return servers, nil } -func (m *MemoryStore) GetServer(serverKey string, shallow bool, modelDetails bool) (*db.Server, *ServerStats, error) { +func (m *ModelServerStore) GetServer(serverKey string, _ bool, modelDetails bool) (*db.Server, *ServerStats, error) { m.mu.RLock() defer m.mu.RUnlock() @@ -305,7 +298,7 @@ func (m *MemoryStore) GetServer(serverKey string, shallow bool, modelDetails boo return server, nil, nil } -func (m *MemoryStore) getServerStats(serverKey string) (*ServerStats, error) { +func (m *ModelServerStore) getServerStats(serverKey string) (*ServerStats, error) { emptyReplicas, err := m.numEmptyServerReplicas(serverKey) if err != nil { return nil, err @@ -322,7 +315,7 @@ func (m *MemoryStore) getServerStats(serverKey string) (*ServerStats, error) { }, nil } -func (m *MemoryStore) getModelServer( +func (m *ModelServerStore) getModelServer( modelKey string, version uint32, serverKey string, @@ -343,7 +336,7 @@ func (m *MemoryStore) getModelServer( return model, modelVersion, server, nil } -func (m *MemoryStore) UpdateLoadedModels( +func (m *ModelServerStore) UpdateLoadedModels( modelKey string, version uint32, serverKey string, @@ -364,7 +357,7 @@ func (m *MemoryStore) UpdateLoadedModels( return nil } -func (m *MemoryStore) updateLoadedModelsImpl( +func (m *ModelServerStore) updateLoadedModelsImpl( modelKey string, version uint32, serverKey string, @@ -373,7 +366,6 @@ func (m *MemoryStore) updateLoadedModelsImpl( logger := m.logger.WithField("func", "updateLoadedModelsImpl") // Validate - // TODO pointer issue model, err := m.store.models.Get(modelKey) if err != nil { if errors.Is(err, ErrNotFound) { @@ -403,7 +395,7 @@ func (m *MemoryStore) updateLoadedModelsImpl( server, err := m.store.servers.Get(serverKey) if err != nil { - return nil, fmt.Errorf("failed to find server %s", serverKey) + return nil, fmt.Errorf("failed to find server %s: %w", serverKey, err) } assignedReplicaIds := make(map[int32]struct{}) @@ -417,6 +409,7 @@ func (m *MemoryStore) updateLoadedModelsImpl( assignedReplicaIds[replica.ReplicaIdx] = struct{}{} } + // TODO shouldn't we be updating the store here??? for modelVersion.HasServer() && modelVersion.Server != serverKey { logger.Debugf("Adding new version as server changed to %s from %s", modelVersion.Server, serverKey) m.addNextModelVersion(model, model.Latest().ModelDefn) @@ -476,7 +469,9 @@ func (m *MemoryStore) updateLoadedModelsImpl( (modelVersion.State.State == db.ModelState_MODEL_STATE_AVAILABLE && len(modelVersion.GetAssignment()) < modelVersion.DesiredReplicas()) { logger.Debugf("Updating model status for model %s server %s", modelKey, serverKey) modelVersion.Server = serverKey - m.updateModelStatus(true, model.Deleted, modelVersion, model.GetLastAvailableModelVersion()) + if err := m.updateModelStatus(true, model.Deleted, modelVersion, model.GetLastAvailableModelVersion(), model); err != nil { + return nil, fmt.Errorf("failed to update model %s: %w", modelKey, err) + } return &coordinator.ModelEventMsg{ ModelName: modelVersion.ModelDefn.Meta.Name, @@ -489,7 +484,7 @@ func (m *MemoryStore) updateLoadedModelsImpl( return nil, nil } -func (m *MemoryStore) UnloadVersionModels(modelKey string, version uint32) (bool, error) { +func (m *ModelServerStore) UnloadVersionModels(modelKey string, version uint32) (bool, error) { evt, updated, err := m.unloadVersionModelsImpl(modelKey, version) if err != nil { return updated, err @@ -503,7 +498,7 @@ func (m *MemoryStore) UnloadVersionModels(modelKey string, version uint32) (bool return updated, nil } -func (m *MemoryStore) unloadVersionModelsImpl(modelKey string, version uint32) (*coordinator.ModelEventMsg, bool, error) { +func (m *ModelServerStore) unloadVersionModelsImpl(modelKey string, version uint32) (*coordinator.ModelEventMsg, bool, error) { logger := m.logger.WithField("func", "UnloadVersionModels") m.mu.Lock() defer m.mu.Unlock() @@ -551,7 +546,7 @@ func (m *MemoryStore) unloadVersionModelsImpl(modelKey string, version uint32) ( return nil, false, nil } -func (m *MemoryStore) UpdateModelState( +func (m *ModelServerStore) UpdateModelState( modelKey string, version uint32, serverKey string, @@ -581,7 +576,7 @@ func (m *MemoryStore) UpdateModelState( return nil } -func (m *MemoryStore) updateModelStateImpl( +func (m *ModelServerStore) updateModelStateImpl( modelKey string, version uint32, serverKey string, @@ -681,7 +676,7 @@ func (m *MemoryStore) updateModelStateImpl( return nil, nil, nil } -func (m *MemoryStore) updateReservedMemory( +func (m *ModelServerStore) updateReservedMemory( modelReplicaState db.ModelReplicaState, serverKey string, replicaIdx int, memBytes uint64, ) { // update reserved memory that is being used for sorting replicas @@ -701,7 +696,7 @@ func (m *MemoryStore) updateReservedMemory( } } -func (m *MemoryStore) AddServerReplica(request *agent.AgentSubscribeRequest) error { +func (m *ModelServerStore) AddServerReplica(request *agent.AgentSubscribeRequest) error { evts, serverEvt, err := m.addServerReplicaImpl(request) if err != nil { return err @@ -722,7 +717,7 @@ func (m *MemoryStore) AddServerReplica(request *agent.AgentSubscribeRequest) err return nil } -func (m *MemoryStore) addServerReplicaImpl(request *agent.AgentSubscribeRequest) ([]coordinator.ModelEventMsg, coordinator.ServerEventMsg, error) { +func (m *ModelServerStore) addServerReplicaImpl(request *agent.AgentSubscribeRequest) ([]coordinator.ModelEventMsg, coordinator.ServerEventMsg, error) { m.mu.Lock() defer m.mu.Unlock() @@ -774,7 +769,7 @@ func (m *MemoryStore) addServerReplicaImpl(request *agent.AgentSubscribeRequest) return evts, serverEvt, nil } -func (m *MemoryStore) RemoveServerReplica(serverName string, replicaIdx int) ([]string, error) { +func (m *ModelServerStore) RemoveServerReplica(serverName string, replicaIdx int) ([]string, error) { models, evts, err := m.removeServerReplicaImpl(serverName, replicaIdx) if err != nil { return nil, err @@ -790,7 +785,7 @@ func (m *MemoryStore) RemoveServerReplica(serverName string, replicaIdx int) ([] return models, nil } -func (m *MemoryStore) removeServerReplicaImpl(serverName string, replicaIdx int) ([]string, []coordinator.ModelEventMsg, error) { +func (m *ModelServerStore) removeServerReplicaImpl(serverName string, replicaIdx int) ([]string, []coordinator.ModelEventMsg, error) { m.mu.Lock() defer m.mu.Unlock() @@ -818,7 +813,7 @@ func (m *MemoryStore) removeServerReplicaImpl(serverName string, replicaIdx int) return modelsRemoved, evts, nil } -func (m *MemoryStore) removeModelfromServerReplica(models []*db.ModelVersionID, replicaIdx int) ([]string, []coordinator.ModelEventMsg) { +func (m *ModelServerStore) removeModelfromServerReplica(models []*db.ModelVersionID, replicaIdx int) ([]string, []coordinator.ModelEventMsg) { logger := m.logger.WithField("func", "RemoveServerReplica") var modelNames []string var evts []coordinator.ModelEventMsg @@ -863,14 +858,14 @@ func (m *MemoryStore) removeModelfromServerReplica(models []*db.ModelVersionID, return modelNames, evts } -func (m *MemoryStore) DrainServerReplica(serverName string, replicaIdx int) ([]string, error) { +func (m *ModelServerStore) DrainServerReplica(serverName string, replicaIdx int) ([]string, error) { m.mu.Lock() defer m.mu.Unlock() return m.drainServerReplicaImpl(serverName, replicaIdx) } -func (m *MemoryStore) drainServerReplicaImpl(serverName string, replicaIdx int) ([]string, error) { +func (m *ModelServerStore) drainServerReplicaImpl(serverName string, replicaIdx int) ([]string, error) { logger := m.logger.WithField("func", "DrainServerReplica") // TODO pointer issue server, err := m.store.servers.Get(serverName) @@ -897,7 +892,7 @@ func (m *MemoryStore) drainServerReplicaImpl(serverName string, replicaIdx int) return append(loadedModels, loadingModels...), nil } -func (m *MemoryStore) findModelsToReSchedule(models []*db.ModelVersionID, replicaIdx int) []string { +func (m *ModelServerStore) findModelsToReSchedule(models []*db.ModelVersionID, replicaIdx int) []string { logger := m.logger.WithField("func", "DrainServerReplica") modelsReSchedule := make([]string, 0) @@ -919,7 +914,7 @@ func (m *MemoryStore) findModelsToReSchedule(models []*db.ModelVersionID, replic return modelsReSchedule } -func (m *MemoryStore) ServerNotify(request *pb.ServerNotify) error { +func (m *ModelServerStore) ServerNotify(request *pb.ServerNotify) error { logger := m.logger.WithField("func", "MemoryServerNotify") m.mu.Lock() defer m.mu.Unlock() @@ -954,7 +949,7 @@ func (m *MemoryStore) ServerNotify(request *pb.ServerNotify) error { return nil } -func (m *MemoryStore) numEmptyServerReplicas(serverName string) (uint32, error) { +func (m *ModelServerStore) numEmptyServerReplicas(serverName string) (uint32, error) { emptyReplicas := uint32(0) server, err := m.store.servers.Get(serverName) if err != nil { @@ -971,7 +966,7 @@ func (m *MemoryStore) numEmptyServerReplicas(serverName string) (uint32, error) return emptyReplicas, nil } -func (m *MemoryStore) maxNumModelReplicasForServer(serverName string) (uint32, error) { +func (m *ModelServerStore) maxNumModelReplicasForServer(serverName string) (uint32, error) { models, err := m.store.models.List() if err != nil { return 0, err @@ -998,7 +993,7 @@ func toSchedulerLoadedModels(agentLoadedModels []*agent.ModelVersion) []*db.Mode return loadedModels } -func (m *MemoryStore) SetModelGwModelState(name string, versionNumber uint32, status db.ModelState, reason string, source string) error { +func (m *ModelServerStore) SetModelGwModelState(name string, versionNumber uint32, status db.ModelState, reason string, source string) error { logger := m.logger.WithField("func", "SetModelGwModelState") logger.Debugf("Attempt to set model-gw state on model %s:%d status:%s", name, versionNumber, status.String()) @@ -1016,7 +1011,7 @@ func (m *MemoryStore) SetModelGwModelState(name string, versionNumber uint32, st return nil } -func (m *MemoryStore) setModelGwModelStateImpl(name string, versionNumber uint32, status db.ModelState, reason, source string) ([]*coordinator.ModelEventMsg, error) { +func (m *ModelServerStore) setModelGwModelStateImpl(name string, versionNumber uint32, status db.ModelState, reason, source string) ([]*coordinator.ModelEventMsg, error) { var evts []*coordinator.ModelEventMsg m.mu.Lock() @@ -1045,7 +1040,7 @@ func (m *MemoryStore) setModelGwModelStateImpl(name string, versionNumber uint32 return evts, nil } -func (m *MemoryStore) EmitEvents() error { +func (m *ModelServerStore) EmitEvents() error { servers, err := m.store.servers.List() if err != nil { return err diff --git a/scheduler/pkg/store/memory_status.go b/scheduler/pkg/store/memory_status.go index b0e17a2e0a..50d9304b4a 100644 --- a/scheduler/pkg/store/memory_status.go +++ b/scheduler/pkg/store/memory_status.go @@ -121,7 +121,7 @@ func updateModelState(isLatest bool, modelVersion *db.ModelVersion, prevModelVer } } -func (m *MemoryStore) FailedScheduling(modelID string, version uint32, reason string, reset bool) error { +func (m *ModelServerStore) FailedScheduling(modelID string, version uint32, reason string, reset bool) error { m.mu.Lock() defer m.mu.Unlock() @@ -170,15 +170,19 @@ func (m *MemoryStore) FailedScheduling(modelID string, version uint32, reason st return fmt.Errorf("model %s found, version %d not found", modelID, version) } -func (m *MemoryStore) updateModelStatus(isLatest bool, deleted bool, modelVersion *db.ModelVersion, prevModelVersion *db.ModelVersion) { +func (m *ModelServerStore) updateModelStatus(isLatest bool, deleted bool, modelVersion *db.ModelVersion, prevModelVersion *db.ModelVersion, model *db.Model) error { logger := m.logger.WithField("func", "updateModelStatus") stats := calcModelVersionStatistics(modelVersion, deleted) logger.Debugf("Stats %+v modelVersion %+v prev model %+v", stats, modelVersion, prevModelVersion) updateModelState(isLatest, modelVersion, prevModelVersion, stats, deleted) + if err := m.store.models.Update(model); err != nil { + return fmt.Errorf("failed to update model: %w", err) + } + return nil } -func (m *MemoryStore) setModelGwStatusToTerminate(isLatest bool, modelVersion *db.ModelVersion) { +func (m *ModelServerStore) setModelGwStatusToTerminate(isLatest bool, modelVersion *db.ModelVersion) { if !isLatest { modelVersion.State.ModelGwState = db.ModelState_MODEL_STATE_TERMINATED modelVersion.State.ModelGwReason = "Not latest version" @@ -188,7 +192,7 @@ func (m *MemoryStore) setModelGwStatusToTerminate(isLatest bool, modelVersion *d modelVersion.State.ModelGwReason = "Model deleted" } -func (m *MemoryStore) UnloadModelGwVersionModels(modelKey string, version uint32) (bool, error) { +func (m *ModelServerStore) UnloadModelGwVersionModels(modelKey string, version uint32) (bool, error) { m.mu.Lock() defer m.mu.Unlock() diff --git a/scheduler/pkg/store/test_memory_hack.go b/scheduler/pkg/store/test_memory_hack.go index f78f9b37d9..4256a91011 100644 --- a/scheduler/pkg/store/test_memory_hack.go +++ b/scheduler/pkg/store/test_memory_hack.go @@ -21,7 +21,7 @@ import ( ) type TestMemoryStore struct { - *MemoryStore + *ModelServerStore } type ModelID struct { @@ -31,7 +31,7 @@ type ModelID struct { // NewTestMemory DO NOT USE for non-test code. This is purely meant for using in tests where an integration test is // wanted where the real memory store is needed, but the test needs the ability to directly manipulate the model -// statuses, which can't be achieved with MemoryStore. TestMemoryStore embeds MemoryStore and adds DirectlyUpdateModelStatus +// statuses, which can't be achieved with ModelServerStore. TestMemoryStore embeds ModelServerStore and adds DirectlyUpdateModelStatus // to modify the statuses. func NewTestMemory( t *testing.T, From 032fe1a7f808dcab9abf782405b90e92791c666d Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Fri, 2 Jan 2026 22:08:54 +0000 Subject: [PATCH 09/50] added save points --- scheduler/pkg/store/memory.go | 324 +++++++++++++++++++++------------- 1 file changed, 204 insertions(+), 120 deletions(-) diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index 5cf4920bde..a1e74f7f82 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -88,18 +88,18 @@ func (m *ModelServerStore) addModelVersionIfNotExists(req *agent.ModelVersion) ( return nil, nil, err } } - if existingModelVersion := model.GetVersion(req.GetVersion()); existingModelVersion == nil { + + var existingModelVersion *db.ModelVersion + if existingModelVersion = model.GetVersion(req.GetVersion()); existingModelVersion == nil { modelVersion := NewDefaultModelVersion(req.GetModel(), req.GetVersion()) model.Versions = append(model.Versions, modelVersion) sort.SliceStable(model.Versions, func(i, j int) bool { // resort model versions based on version number return model.Versions[i].GetVersion() < model.Versions[j].GetVersion() }) return model, modelVersion, nil - } else { - return model, existingModelVersion, nil } - // TODO pointer save issue + return model, existingModelVersion, nil } func (m *ModelServerStore) addNextModelVersion(model *db.Model, pbmodel *pb.Model) { @@ -265,13 +265,7 @@ func (m *ModelServerStore) removeModelImpl(req *pb.UnloadModelRequest) error { func (m *ModelServerStore) GetServers(shallow bool, modelDetails bool) ([]*db.Server, error) { m.mu.RLock() defer m.mu.RUnlock() - - servers, err := m.store.servers.List() - if err != nil { - return nil, fmt.Errorf("failed to list servers: %w", err) - } - - return servers, nil + return m.store.servers.List() } func (m *ModelServerStore) GetServer(serverKey string, _ bool, modelDetails bool) (*db.Server, *ServerStats, error) { @@ -409,10 +403,12 @@ func (m *ModelServerStore) updateLoadedModelsImpl( assignedReplicaIds[replica.ReplicaIdx] = struct{}{} } - // TODO shouldn't we be updating the store here??? for modelVersion.HasServer() && modelVersion.Server != serverKey { logger.Debugf("Adding new version as server changed to %s from %s", modelVersion.Server, serverKey) m.addNextModelVersion(model, model.Latest().ModelDefn) + if err := m.store.models.Update(model); err != nil { + return nil, fmt.Errorf("failed to update model %s: %w", modelKey, err) + } modelVersion = model.Latest() } @@ -425,7 +421,10 @@ func (m *ModelServerStore) updateLoadedModelsImpl( modelKey, modelVersion.Version, existingState.State.String(), serverKey, replicaIdx, ) modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED, "") - m.updateReservedMemory(db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, serverKey, int(replicaIdx), modelVersion.GetRequiredMemory()) + if err := m.updateReservedMemory(db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, serverKey, + int(replicaIdx), modelVersion.GetRequiredMemory()); err != nil { + return nil, fmt.Errorf("failed to update server %s replica %d: %w", serverKey, replicaIdx, err) + } replicaStateUpdated = true } else { logger.Debugf( @@ -434,7 +433,10 @@ func (m *ModelServerStore) updateLoadedModelsImpl( ) if !existingState.State.AlreadyLoadingOrLoaded() { modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, "") - m.updateReservedMemory(db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, serverKey, int(replicaIdx), modelVersion.GetRequiredMemory()) + if err := m.updateReservedMemory(db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, serverKey, + int(replicaIdx), modelVersion.GetRequiredMemory()); err != nil { + return nil, fmt.Errorf("failed to update server %s replica %d: %w", serverKey, replicaIdx, err) + } replicaStateUpdated = true } } @@ -474,10 +476,9 @@ func (m *ModelServerStore) updateLoadedModelsImpl( } return &coordinator.ModelEventMsg{ - ModelName: modelVersion.ModelDefn.Meta.Name, - ModelVersion: modelVersion.GetVersion(), - }, - nil + ModelName: modelVersion.ModelDefn.Meta.Name, + ModelVersion: modelVersion.GetVersion(), + }, nil } logger.Debugf("Model status update not required for model %s server %s as no replicas were updated", modelKey, serverKey) @@ -504,7 +505,6 @@ func (m *ModelServerStore) unloadVersionModelsImpl(modelKey string, version uint defer m.mu.Unlock() // Validate - // TODO pointer issue model, err := m.store.models.Get(modelKey) if err != nil { return nil, false, fmt.Errorf("failed to find model %s", modelKey) @@ -534,9 +534,12 @@ func (m *ModelServerStore) unloadVersionModelsImpl(modelKey string, version uint modelKey, modelVersion.Server, replicaIdx, ) } + if updated { logger.Debugf("Calling update model status for model %s version %d", modelKey, version) - m.updateModelStatus(false, model.Deleted, modelVersion, model.GetLastAvailableModelVersion()) + if err := m.updateModelStatus(false, model.Deleted, modelVersion, model.GetLastAvailableModelVersion(), model); err != nil { + return nil, false, fmt.Errorf("failed to update model %s: %w", modelKey, err) + } return &coordinator.ModelEventMsg{ ModelName: modelVersion.ModelDefn.Meta.Name, ModelVersion: modelVersion.GetVersion(), @@ -557,7 +560,7 @@ func (m *ModelServerStore) UpdateModelState( reason string, runtimeInfo *pb.ModelRuntimeInfo, ) error { - modelEvt, serverEvt, err := m.updateModelStateImpl(modelKey, version, serverKey, replicaIdx, availableMemory, expectedState, desiredState, reason, runtimeInfo) + modelEvt, serverEvt, err := m.updateModelState(modelKey, version, serverKey, replicaIdx, availableMemory, expectedState, desiredState, reason, runtimeInfo) if err != nil { return err } @@ -576,7 +579,14 @@ func (m *ModelServerStore) UpdateModelState( return nil } -func (m *ModelServerStore) updateModelStateImpl( +type updateModelResult struct { + eventModel *coordinator.ModelEventMsg + eventServer *coordinator.ServerEventMsg + model *db.Model + server *db.Server +} + +func (m *ModelServerStore) updateModelState( modelKey string, version uint32, serverKey string, @@ -587,113 +597,158 @@ func (m *ModelServerStore) updateModelStateImpl( reason string, runtimeInfo *pb.ModelRuntimeInfo, ) (*coordinator.ModelEventMsg, *coordinator.ServerEventMsg, error) { - logger := m.logger.WithField("func", "updateModelStateImpl") m.mu.Lock() defer m.mu.Unlock() + res, err := m.updateModelStateImpl(modelKey, version, serverKey, replicaIdx, availableMemory, expectedState, desiredState, reason, runtimeInfo) + if err != nil { + return nil, nil, err + } + + // TODO should be in a transaction to rollback if updating servers fail + if err := m.store.models.Update(res.model); err != nil { + return nil, nil, fmt.Errorf("failed to update model %s: %w", modelKey, err) + } + if err := m.store.servers.Update(res.server); err != nil { + return nil, nil, fmt.Errorf("failed to update server %s: %w", serverKey, err) + } + + return res.eventModel, res.eventServer, nil +} + +func (m *ModelServerStore) updateModelStateImpl( + modelKey string, + version uint32, + serverKey string, + replicaIdx int, + availableMemory *uint64, + expectedState db.ModelReplicaState, + desiredState db.ModelReplicaState, + reason string, + runtimeInfo *pb.ModelRuntimeInfo, +) (*updateModelResult, error) { + logger := m.logger.WithField("func", "updateModelStateImpl") + // Validate model, modelVersion, server, err := m.getModelServer(modelKey, version, serverKey) if err != nil { - return nil, nil, err + return nil, err } modelVersion.UpdateRuntimeInfo(runtimeInfo) existingState := modelVersion.GetModelReplicaState(replicaIdx) - if existingState != expectedState { - return nil, nil, fmt.Errorf( + return nil, fmt.Errorf( "state mismatch for %s:%d expected state %s but was %s when trying to move to state %s", modelKey, version, expectedState.String(), existingState.String(), desiredState.String(), ) } - m.updateReservedMemory(desiredState, serverKey, replicaIdx, modelVersion.GetRequiredMemory()) + if err := m.updateReservedMemory(desiredState, serverKey, replicaIdx, modelVersion.GetRequiredMemory()); err != nil { + return nil, fmt.Errorf("failed to server %s replica %d: %w", serverKey, replicaIdx, err) + } deletedModelReplica := false - if existingState != desiredState { - latestModel := model.Latest() - isLatest := latestModel.GetVersion() == modelVersion.GetVersion() + if existingState == desiredState { + return &updateModelResult{ + model: model, + server: server, + }, nil + } - modelVersion.SetReplicaState(replicaIdx, desiredState, reason) - logger.Debugf( - "Setting model %s version %d on server %s replica %d to %s", - modelKey, version, serverKey, replicaIdx, desiredState.String(), - ) + latestModel := model.Latest() + isLatest := latestModel.GetVersion() == modelVersion.GetVersion() - // Update models loaded onto replica for relevant state - if desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED || - desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING || - desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED || - desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED { - // TODO pointer - server, err := m.store.servers.Get(serverKey) - // TODO handle error - if err == nil { - replica, ok := server.Replicas[int32(replicaIdx)] - if ok { - if desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED || desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING { - logger.Infof( - "Adding model %s(%d) to server %s replica %d list of loaded / loading models", - modelKey, version, serverKey, replicaIdx, - ) - replica.AddModelVersion(modelKey, version, desiredState) // we need to distinguish between loaded and loading - } else { - logger.Infof( - "Removing model %s(%d) from server %s replica %d list of loaded / loading models", - modelKey, version, serverKey, replicaIdx, - ) - // we could go from loaded -> unloaded or loading -> failed. in the case we have a failure then we just remove from loading - deletedModelReplica = true - replica.DeleteModelVersion(modelKey, version) - } - } + modelVersion.SetReplicaState(replicaIdx, desiredState, reason) + logger.Debugf( + "Setting model %s version %d on server %s replica %d to %s", + modelKey, version, serverKey, replicaIdx, desiredState.String(), + ) + + // Update models loaded onto replica for relevant state + if desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED || + desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING || + desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED || + desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED { + + replica, ok := server.Replicas[int32(replicaIdx)] + if ok { + if desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED || desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING { + logger.Infof( + "Adding model %s(%d) to server %s replica %d list of loaded / loading models", + modelKey, version, serverKey, replicaIdx, + ) + replica.AddModelVersion(modelKey, version, desiredState) // we need to distinguish between loaded and loading + } else { + logger.Infof( + "Removing model %s(%d) from server %s replica %d list of loaded / loading models", + modelKey, version, serverKey, replicaIdx, + ) + // we could go from loaded -> unloaded or loading -> failed. in the case we have a failure then we just remove from loading + deletedModelReplica = true + replica.DeleteModelVersion(modelKey, version) } } - if availableMemory != nil { - server.Replicas[int32(replicaIdx)].AvailableMemory = *availableMemory - } + } + if availableMemory != nil { + server.Replicas[int32(replicaIdx)].AvailableMemory = *availableMemory + } - m.updateModelStatus(isLatest, model.Deleted, modelVersion, model.GetLastAvailableModelVersion()) - modelEvt := &coordinator.ModelEventMsg{ - ModelName: modelVersion.ModelDefn.Meta.Name, - ModelVersion: modelVersion.GetVersion(), - } - if deletedModelReplica { - return modelEvt, - &coordinator.ServerEventMsg{ - ServerName: serverKey, - UpdateContext: coordinator.SERVER_SCALE_DOWN, - }, - nil - } else { - return modelEvt, - nil, - nil - } + if err := m.updateModelStatus(isLatest, model.Deleted, modelVersion, model.GetLastAvailableModelVersion(), model); err != nil { + return nil, fmt.Errorf("update model status failed: %w", err) } - return nil, nil, nil + modelEvt := &coordinator.ModelEventMsg{ + ModelName: modelVersion.ModelDefn.Meta.Name, + ModelVersion: modelVersion.GetVersion(), + } + if deletedModelReplica { + return &updateModelResult{ + eventModel: modelEvt, + eventServer: &coordinator.ServerEventMsg{ + ServerName: serverKey, + UpdateContext: coordinator.SERVER_SCALE_DOWN, + }, + model: model, + server: server, + }, nil + } + + return &updateModelResult{ + eventModel: modelEvt, + model: model, + server: server, + }, nil } func (m *ModelServerStore) updateReservedMemory( modelReplicaState db.ModelReplicaState, serverKey string, replicaIdx int, memBytes uint64, -) { +) error { // update reserved memory that is being used for sorting replicas // do we need to lock replica update? server, err := m.store.servers.Get(serverKey) - // TODO handle error?? - if err == nil { - replica, okReplica := server.Replicas[int32(replicaIdx)] - if okReplica { - if modelReplicaState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED { - replica.UpdateReservedMemory(memBytes, true) - } else if modelReplicaState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED || - modelReplicaState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED { - replica.UpdateReservedMemory(memBytes, false) - } + if err != nil { + return fmt.Errorf("get server %s failed: %w", serverKey, err) + } + + replica, okReplica := server.Replicas[int32(replicaIdx)] + update := false + if okReplica { + if modelReplicaState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED { + replica.UpdateReservedMemory(memBytes, true) + update = true + } else if modelReplicaState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED || + modelReplicaState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED { + replica.UpdateReservedMemory(memBytes, false) + update = true } } + + if update { + return m.store.servers.Update(server) + } + return nil } func (m *ModelServerStore) AddServerReplica(request *agent.AgentSubscribeRequest) error { @@ -721,7 +776,6 @@ func (m *ModelServerStore) addServerReplicaImpl(request *agent.AgentSubscribeReq m.mu.Lock() defer m.mu.Unlock() - // TODO pointer issue server, err := m.store.servers.Get(request.ServerName) if err != nil { if !errors.Is(err, ErrNotFound) { @@ -745,6 +799,10 @@ func (m *ModelServerStore) addServerReplicaImpl(request *agent.AgentSubscribeReq ) server.Replicas[int32(request.ReplicaIdx)] = serverReplica + if err := m.store.servers.Update(server); err != nil { + return nil, coordinator.ServerEventMsg{}, fmt.Errorf("failed to update server %s: %w", request.ServerName, err) + } + var evts []coordinator.ModelEventMsg for _, modelVersionReq := range request.LoadedModels { model, modelVersion, err := m.addModelVersionIfNotExists(modelVersionReq) @@ -753,7 +811,9 @@ func (m *ModelServerStore) addServerReplicaImpl(request *agent.AgentSubscribeReq } modelVersion.Replicas[int32(request.ReplicaIdx)] = &db.ReplicaStatus{State: db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED} modelVersion.Server = request.ServerName - m.updateModelStatus(true, false, modelVersion, model.GetLastAvailableModelVersion()) + if err := m.updateModelStatus(true, false, modelVersion, model.GetLastAvailableModelVersion(), model); err != nil { + return nil, coordinator.ServerEventMsg{}, fmt.Errorf("failed to update model status: %w", err) + } evts = append(evts, coordinator.ModelEventMsg{ ModelName: modelVersion.ModelDefn.Meta.Name, ModelVersion: modelVersion.GetVersion(), @@ -798,14 +858,25 @@ func (m *ModelServerStore) removeServerReplicaImpl(serverName string, replicaIdx return nil, nil, fmt.Errorf("Failed to find replica %d for server %s", replicaIdx, serverName) } delete(server.Replicas, int32(replicaIdx)) + + if err := m.store.servers.Update(server); err != nil { + return nil, nil, fmt.Errorf("failed to update server %s: %w", serverName, err) + } + // TODO we should not reschedule models on servers with dedicated models, e.g. non shareable servers if len(server.Replicas) == 0 { if err := m.store.servers.Delete(serverName); err != nil { return nil, nil, fmt.Errorf("failed to delete server %s: %w", serverName, err) } } - loadedModelsRemoved, loadedEvts := m.removeModelfromServerReplica(serverReplica.LoadedModels, replicaIdx) - loadingModelsRemoved, loadingEtvs := m.removeModelfromServerReplica(serverReplica.LoadingModels, replicaIdx) + loadedModelsRemoved, loadedEvts, err := m.removeModelfromServerReplica(serverReplica.LoadedModels, replicaIdx) + if err != nil { + return nil, nil, fmt.Errorf("failed to remove loaded models from replica %d: %w", replicaIdx, err) + } + loadingModelsRemoved, loadingEtvs, err := m.removeModelfromServerReplica(serverReplica.LoadingModels, replicaIdx) + if err != nil { + return nil, nil, fmt.Errorf("failed to remove loading models from replica %d: %w", replicaIdx, err) + } modelsRemoved := append(loadedModelsRemoved, loadingModelsRemoved...) evts := append(loadedEvts, loadingEtvs...) @@ -813,7 +884,7 @@ func (m *ModelServerStore) removeServerReplicaImpl(serverName string, replicaIdx return modelsRemoved, evts, nil } -func (m *ModelServerStore) removeModelfromServerReplica(models []*db.ModelVersionID, replicaIdx int) ([]string, []coordinator.ModelEventMsg) { +func (m *ModelServerStore) removeModelfromServerReplica(models []*db.ModelVersionID, replicaIdx int) ([]string, []coordinator.ModelEventMsg, error) { logger := m.logger.WithField("func", "RemoveServerReplica") var modelNames []string var evts []coordinator.ModelEventMsg @@ -835,9 +906,11 @@ func (m *ModelServerStore) removeModelfromServerReplica(models []*db.ModelVersio modelVersion.SetReplicaState(replicaIdx, db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED, "model is removed when server replica was removed") m.LockModel(v.Name) - m.updateModelStatus( + if err := m.updateModelStatus( model.Latest().GetVersion() == modelVersion.GetVersion(), - model.Deleted, modelVersion, model.GetLastAvailableModelVersion()) + model.Deleted, modelVersion, model.GetLastAvailableModelVersion(), model); err != nil { + return nil, nil, fmt.Errorf("failed to update model %s status: %w", model.Name, err) + } m.UnlockModel(v.Name) // send an event to progress the deletion evts = append( @@ -849,28 +922,29 @@ func (m *ModelServerStore) removeModelfromServerReplica(models []*db.ModelVersio ) } else { modelNames = append(modelNames, v.Name) + if err := m.store.models.Update(model); err != nil { + return nil, nil, fmt.Errorf("failed to update model %s status: %w", model.Name, err) + } } } else { logger.Warnf("Can't find model version %s", v.String()) } } } - return modelNames, evts + return modelNames, evts, nil } func (m *ModelServerStore) DrainServerReplica(serverName string, replicaIdx int) ([]string, error) { m.mu.Lock() defer m.mu.Unlock() - return m.drainServerReplicaImpl(serverName, replicaIdx) } func (m *ModelServerStore) drainServerReplicaImpl(serverName string, replicaIdx int) ([]string, error) { logger := m.logger.WithField("func", "DrainServerReplica") - // TODO pointer issue server, err := m.store.servers.Get(serverName) if err != nil { - return nil, fmt.Errorf("Failed to find server %s", serverName) + return nil, fmt.Errorf("failed to find server %s", serverName) } serverReplica, ok := server.Replicas[int32(replicaIdx)] if !ok { @@ -880,11 +954,21 @@ func (m *ModelServerStore) drainServerReplicaImpl(serverName string, replicaIdx // we mark this server replica as draining so should not be used in future scheduling decisions serverReplica.IsDraining = true - loadedModels := m.findModelsToReSchedule(serverReplica.LoadingModels, replicaIdx) + if err := m.store.servers.Update(server); err != nil { + return nil, fmt.Errorf("failed to update server %s: %w", serverName, err) + } + + loadedModels, err := m.findModelsToReSchedule(serverReplica.LoadingModels, replicaIdx) + if err != nil { + return nil, err + } if len(loadedModels) > 0 { logger.WithField("models", loadedModels).Debug("Found loaded models to re-schedule") } - loadingModels := m.findModelsToReSchedule(serverReplica.LoadingModels, replicaIdx) + loadingModels, err := m.findModelsToReSchedule(serverReplica.LoadingModels, replicaIdx) + if err != nil { + return nil, err + } if len(loadingModels) > 0 { logger.WithField("models", loadingModels).Debug("Found loading models to re-schedule") } @@ -892,18 +976,19 @@ func (m *ModelServerStore) drainServerReplicaImpl(serverName string, replicaIdx return append(loadedModels, loadingModels...), nil } -func (m *ModelServerStore) findModelsToReSchedule(models []*db.ModelVersionID, replicaIdx int) []string { +func (m *ModelServerStore) findModelsToReSchedule(models []*db.ModelVersionID, replicaIdx int) ([]string, error) { logger := m.logger.WithField("func", "DrainServerReplica") modelsReSchedule := make([]string, 0) for _, v := range models { - // TODO pointer issue - // TODO handle error? model, err := m.store.models.Get(v.Name) if err == nil { modelVersion := model.GetVersion(v.Version) if modelVersion != nil { modelVersion.SetReplicaState(replicaIdx, db.ModelReplicaState_MODEL_REPLICA_STATE_DRAINING, "trigger to drain") + if err := m.store.models.Update(model); err != nil { + return nil, fmt.Errorf("failed update model %s: %w", model.Name, err) + } modelsReSchedule = append(modelsReSchedule, model.Name) continue } @@ -911,7 +996,7 @@ func (m *ModelServerStore) findModelsToReSchedule(models []*db.ModelVersionID, r } } - return modelsReSchedule + return modelsReSchedule, nil } func (m *ModelServerStore) ServerNotify(request *pb.ServerNotify) error { @@ -945,7 +1030,6 @@ func (m *ModelServerStore) ServerNotify(request *pb.ServerNotify) error { if err := m.store.servers.Update(server); err != nil { return fmt.Errorf("failed to update server %s: %w", request.Name, err) } - return nil } @@ -1012,12 +1096,10 @@ func (m *ModelServerStore) SetModelGwModelState(name string, versionNumber uint3 } func (m *ModelServerStore) setModelGwModelStateImpl(name string, versionNumber uint32, status db.ModelState, reason, source string) ([]*coordinator.ModelEventMsg, error) { - var evts []*coordinator.ModelEventMsg - m.mu.Lock() defer m.mu.Unlock() - // TODO pointer issue + var evts []*coordinator.ModelEventMsg model, err := m.store.models.Get(name) if err != nil { return nil, fmt.Errorf("failed to find model %s", name) @@ -1030,6 +1112,9 @@ func (m *ModelServerStore) setModelGwModelStateImpl(name string, versionNumber u if modelVersion.State.ModelGwState != status || modelVersion.State.ModelGwReason != reason { modelVersion.State.ModelGwState = status modelVersion.State.ModelGwReason = reason + if err := m.store.models.Update(model); err != nil { + return nil, fmt.Errorf("failed to update model %s: %w", name, err) + } evt := &coordinator.ModelEventMsg{ ModelName: modelVersion.ModelDefn.Meta.Name, ModelVersion: modelVersion.GetVersion(), @@ -1045,6 +1130,10 @@ func (m *ModelServerStore) EmitEvents() error { if err != nil { return err } + models, err := m.store.models.List() + if err != nil { + return err + } for _, server := range servers { for id := range server.Replicas { @@ -1057,11 +1146,6 @@ func (m *ModelServerStore) EmitEvents() error { } } - models, err := m.store.models.List() - if err != nil { - return err - } - for _, model := range models { latest := model.GetLastAvailableModelVersion() if latest == nil { From 79e41223236b78bb73279e3d9d5a0ba96131a94a Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 5 Jan 2026 12:43:47 +0000 Subject: [PATCH 10/50] fix nil pointer panics and align enum state values so operator can derive correct statuses of model/replicas --- apis/go/mlops/agent/agent.pb.go | 9 + apis/go/mlops/agent/agent_grpc.pb.go | 9 + apis/go/mlops/agent_debug/agent_debug.pb.go | 9 + .../mlops/agent_debug/agent_debug_grpc.pb.go | 9 + apis/go/mlops/chainer/chainer.pb.go | 9 + apis/go/mlops/chainer/chainer_grpc.pb.go | 9 + apis/go/mlops/health/health.pb.go | 9 + apis/go/mlops/health/health_grpc.pb.go | 9 + apis/go/mlops/proxy/proxy.pb.go | 9 + apis/go/mlops/proxy/proxy_grpc.pb.go | 9 + apis/go/mlops/scheduler/db/db.pb.go | 268 +++++++++--------- apis/go/mlops/scheduler/db/model_ext.go | 77 +++-- apis/go/mlops/scheduler/db/server_ext.go | 34 ++- apis/go/mlops/scheduler/scheduler.pb.go | 9 + apis/go/mlops/scheduler/scheduler_grpc.pb.go | 9 + apis/mlops/scheduler/db/db.proto | 48 ++-- scheduler/pkg/agent/server.go | 41 +-- scheduler/pkg/envoy/processor/incremental.go | 13 +- .../conflict_resolution.go | 26 +- scheduler/pkg/scheduler/cleaner/versions.go | 7 +- scheduler/pkg/scheduler/scheduler.go | 6 +- scheduler/pkg/server/server_status.go | 50 ++-- scheduler/pkg/store/experiment/state.go | 4 +- scheduler/pkg/store/experiment/store.go | 4 +- scheduler/pkg/store/in-memory/store.go | 13 +- scheduler/pkg/store/memory.go | 49 ++-- scheduler/pkg/store/memory_status.go | 40 +-- scheduler/pkg/store/mesh.go | 4 +- scheduler/pkg/store/mock/store.go | 9 + scheduler/pkg/store/pipeline/status.go | 2 +- scheduler/pkg/store/pipeline/store.go | 2 +- 31 files changed, 484 insertions(+), 321 deletions(-) diff --git a/apis/go/mlops/agent/agent.pb.go b/apis/go/mlops/agent/agent.pb.go index e3b764b9ec..855bd30c5b 100644 --- a/apis/go/mlops/agent/agent.pb.go +++ b/apis/go/mlops/agent/agent.pb.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.34.2 diff --git a/apis/go/mlops/agent/agent_grpc.pb.go b/apis/go/mlops/agent/agent_grpc.pb.go index da32efd503..e82793408c 100644 --- a/apis/go/mlops/agent/agent_grpc.pb.go +++ b/apis/go/mlops/agent/agent_grpc.pb.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.4.0 diff --git a/apis/go/mlops/agent_debug/agent_debug.pb.go b/apis/go/mlops/agent_debug/agent_debug.pb.go index cbff1f39c3..9605182cd8 100644 --- a/apis/go/mlops/agent_debug/agent_debug.pb.go +++ b/apis/go/mlops/agent_debug/agent_debug.pb.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Debug service for agent replica // Code generated by protoc-gen-go. DO NOT EDIT. diff --git a/apis/go/mlops/agent_debug/agent_debug_grpc.pb.go b/apis/go/mlops/agent_debug/agent_debug_grpc.pb.go index 0dd305f361..dfb4269ef2 100644 --- a/apis/go/mlops/agent_debug/agent_debug_grpc.pb.go +++ b/apis/go/mlops/agent_debug/agent_debug_grpc.pb.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Debug service for agent replica // Code generated by protoc-gen-go-grpc. DO NOT EDIT. diff --git a/apis/go/mlops/chainer/chainer.pb.go b/apis/go/mlops/chainer/chainer.pb.go index 0eec999a48..c31ae7b569 100644 --- a/apis/go/mlops/chainer/chainer.pb.go +++ b/apis/go/mlops/chainer/chainer.pb.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.34.2 diff --git a/apis/go/mlops/chainer/chainer_grpc.pb.go b/apis/go/mlops/chainer/chainer_grpc.pb.go index 28defc847b..a4974064f7 100644 --- a/apis/go/mlops/chainer/chainer_grpc.pb.go +++ b/apis/go/mlops/chainer/chainer_grpc.pb.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.4.0 diff --git a/apis/go/mlops/health/health.pb.go b/apis/go/mlops/health/health.pb.go index 7340c165d2..11ef4b5936 100644 --- a/apis/go/mlops/health/health.pb.go +++ b/apis/go/mlops/health/health.pb.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.34.2 diff --git a/apis/go/mlops/health/health_grpc.pb.go b/apis/go/mlops/health/health_grpc.pb.go index a5d1c62e18..7c5c501b13 100644 --- a/apis/go/mlops/health/health_grpc.pb.go +++ b/apis/go/mlops/health/health_grpc.pb.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.4.0 diff --git a/apis/go/mlops/proxy/proxy.pb.go b/apis/go/mlops/proxy/proxy.pb.go index bda6bf09ff..f35d13234d 100644 --- a/apis/go/mlops/proxy/proxy.pb.go +++ b/apis/go/mlops/proxy/proxy.pb.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.34.2 diff --git a/apis/go/mlops/proxy/proxy_grpc.pb.go b/apis/go/mlops/proxy/proxy_grpc.pb.go index de21a28d47..528dbec929 100644 --- a/apis/go/mlops/proxy/proxy_grpc.pb.go +++ b/apis/go/mlops/proxy/proxy_grpc.pb.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.4.0 diff --git a/apis/go/mlops/scheduler/db/db.pb.go b/apis/go/mlops/scheduler/db/db.pb.go index d95f186522..ff34ef0d58 100644 --- a/apis/go/mlops/scheduler/db/db.pb.go +++ b/apis/go/mlops/scheduler/db/db.pb.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.34.2 @@ -26,46 +35,46 @@ const ( type ModelState int32 const ( - ModelState_MODEL_STATE_UNKNOWN ModelState = 0 - ModelState_MODEL_STATE_PROGRESSING ModelState = 1 - ModelState_MODEL_STATE_AVAILABLE ModelState = 2 - ModelState_MODEL_STATE_FAILED ModelState = 3 - ModelState_MODEL_STATE_TERMINATING ModelState = 4 - ModelState_MODEL_STATE_TERMINATED ModelState = 5 - ModelState_MODEL_STATE_TERMINATE_FAILED ModelState = 6 - ModelState_MODEL_STATE_SCHEDULE_FAILED ModelState = 7 - ModelState_MODEL_STATE_SCALED_DOWN ModelState = 8 - ModelState_MODEL_STATE_CREATE ModelState = 9 - ModelState_MODEL_STATE_TERMINATE ModelState = 10 + ModelState_ModelStateUnknown ModelState = 0 + ModelState_ModelProgressing ModelState = 1 + ModelState_ModelAvailable ModelState = 2 + ModelState_ModelFailed ModelState = 3 + ModelState_ModelTerminating ModelState = 4 + ModelState_ModelTerminated ModelState = 5 + ModelState_ModelTerminateFailed ModelState = 6 + ModelState_ScheduleFailed ModelState = 7 + ModelState_ModelScaledDown ModelState = 8 + ModelState_ModelCreate ModelState = 9 + ModelState_ModelTerminate ModelState = 10 ) // Enum value maps for ModelState. var ( ModelState_name = map[int32]string{ - 0: "MODEL_STATE_UNKNOWN", - 1: "MODEL_STATE_PROGRESSING", - 2: "MODEL_STATE_AVAILABLE", - 3: "MODEL_STATE_FAILED", - 4: "MODEL_STATE_TERMINATING", - 5: "MODEL_STATE_TERMINATED", - 6: "MODEL_STATE_TERMINATE_FAILED", - 7: "MODEL_STATE_SCHEDULE_FAILED", - 8: "MODEL_STATE_SCALED_DOWN", - 9: "MODEL_STATE_CREATE", - 10: "MODEL_STATE_TERMINATE", + 0: "ModelStateUnknown", + 1: "ModelProgressing", + 2: "ModelAvailable", + 3: "ModelFailed", + 4: "ModelTerminating", + 5: "ModelTerminated", + 6: "ModelTerminateFailed", + 7: "ScheduleFailed", + 8: "ModelScaledDown", + 9: "ModelCreate", + 10: "ModelTerminate", } ModelState_value = map[string]int32{ - "MODEL_STATE_UNKNOWN": 0, - "MODEL_STATE_PROGRESSING": 1, - "MODEL_STATE_AVAILABLE": 2, - "MODEL_STATE_FAILED": 3, - "MODEL_STATE_TERMINATING": 4, - "MODEL_STATE_TERMINATED": 5, - "MODEL_STATE_TERMINATE_FAILED": 6, - "MODEL_STATE_SCHEDULE_FAILED": 7, - "MODEL_STATE_SCALED_DOWN": 8, - "MODEL_STATE_CREATE": 9, - "MODEL_STATE_TERMINATE": 10, + "ModelStateUnknown": 0, + "ModelProgressing": 1, + "ModelAvailable": 2, + "ModelFailed": 3, + "ModelTerminating": 4, + "ModelTerminated": 5, + "ModelTerminateFailed": 6, + "ScheduleFailed": 7, + "ModelScaledDown": 8, + "ModelCreate": 9, + "ModelTerminate": 10, } ) @@ -100,52 +109,52 @@ func (ModelState) EnumDescriptor() ([]byte, []int) { type ModelReplicaState int32 const ( - ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN ModelReplicaState = 0 - ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED ModelReplicaState = 1 - ModelReplicaState_MODEL_REPLICA_STATE_LOADING ModelReplicaState = 2 - ModelReplicaState_MODEL_REPLICA_STATE_LOADED ModelReplicaState = 3 - ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED ModelReplicaState = 4 - ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED ModelReplicaState = 5 - ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED ModelReplicaState = 6 - ModelReplicaState_MODEL_REPLICA_STATE_UNLOADING ModelReplicaState = 7 - ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED ModelReplicaState = 8 - ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_FAILED ModelReplicaState = 9 - ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE ModelReplicaState = 10 - ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE ModelReplicaState = 11 - ModelReplicaState_MODEL_REPLICA_STATE_DRAINING ModelReplicaState = 12 + ModelReplicaState_ModelReplicaStateUnknown ModelReplicaState = 0 + ModelReplicaState_LoadRequested ModelReplicaState = 1 + ModelReplicaState_Loading ModelReplicaState = 2 + ModelReplicaState_Loaded ModelReplicaState = 3 + ModelReplicaState_LoadFailed ModelReplicaState = 4 + ModelReplicaState_UnloadRequested ModelReplicaState = 5 + ModelReplicaState_Unloading ModelReplicaState = 6 + ModelReplicaState_Unloaded ModelReplicaState = 7 + ModelReplicaState_UnloadFailed ModelReplicaState = 8 + ModelReplicaState_Available ModelReplicaState = 9 + ModelReplicaState_LoadedUnavailable ModelReplicaState = 10 + ModelReplicaState_UnloadEnvoyRequested ModelReplicaState = 11 + ModelReplicaState_Draining ModelReplicaState = 12 ) // Enum value maps for ModelReplicaState. var ( ModelReplicaState_name = map[int32]string{ - 0: "MODEL_REPLICA_STATE_UNKNOWN", - 1: "MODEL_REPLICA_STATE_LOAD_REQUESTED", - 2: "MODEL_REPLICA_STATE_LOADING", - 3: "MODEL_REPLICA_STATE_LOADED", - 4: "MODEL_REPLICA_STATE_LOAD_FAILED", - 5: "MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED", - 6: "MODEL_REPLICA_STATE_UNLOAD_REQUESTED", - 7: "MODEL_REPLICA_STATE_UNLOADING", - 8: "MODEL_REPLICA_STATE_UNLOADED", - 9: "MODEL_REPLICA_STATE_UNLOAD_FAILED", - 10: "MODEL_REPLICA_STATE_AVAILABLE", - 11: "MODEL_REPLICA_STATE_LOADED_UNAVAILABLE", - 12: "MODEL_REPLICA_STATE_DRAINING", + 0: "ModelReplicaStateUnknown", + 1: "LoadRequested", + 2: "Loading", + 3: "Loaded", + 4: "LoadFailed", + 5: "UnloadRequested", + 6: "Unloading", + 7: "Unloaded", + 8: "UnloadFailed", + 9: "Available", + 10: "LoadedUnavailable", + 11: "UnloadEnvoyRequested", + 12: "Draining", } ModelReplicaState_value = map[string]int32{ - "MODEL_REPLICA_STATE_UNKNOWN": 0, - "MODEL_REPLICA_STATE_LOAD_REQUESTED": 1, - "MODEL_REPLICA_STATE_LOADING": 2, - "MODEL_REPLICA_STATE_LOADED": 3, - "MODEL_REPLICA_STATE_LOAD_FAILED": 4, - "MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED": 5, - "MODEL_REPLICA_STATE_UNLOAD_REQUESTED": 6, - "MODEL_REPLICA_STATE_UNLOADING": 7, - "MODEL_REPLICA_STATE_UNLOADED": 8, - "MODEL_REPLICA_STATE_UNLOAD_FAILED": 9, - "MODEL_REPLICA_STATE_AVAILABLE": 10, - "MODEL_REPLICA_STATE_LOADED_UNAVAILABLE": 11, - "MODEL_REPLICA_STATE_DRAINING": 12, + "ModelReplicaStateUnknown": 0, + "LoadRequested": 1, + "Loading": 2, + "Loaded": 3, + "LoadFailed": 4, + "UnloadRequested": 5, + "Unloading": 6, + "Unloaded": 7, + "UnloadFailed": 8, + "Available": 9, + "LoadedUnavailable": 10, + "UnloadEnvoyRequested": 11, + "Draining": 12, } ) @@ -352,7 +361,7 @@ func (x *ReplicaStatus) GetState() ModelReplicaState { if x != nil { return x.State } - return ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN + return ModelReplicaState_ModelReplicaStateUnknown } func (x *ReplicaStatus) GetReason() string { @@ -421,14 +430,14 @@ func (x *ModelStatus) GetState() ModelState { if x != nil { return x.State } - return ModelState_MODEL_STATE_UNKNOWN + return ModelState_ModelStateUnknown } func (x *ModelStatus) GetModelGwState() ModelState { if x != nil { return x.ModelGwState } - return ModelState_MODEL_STATE_UNKNOWN + return ModelState_ModelStateUnknown } func (x *ModelStatus) GetReason() string { @@ -775,15 +784,14 @@ type ServerReplica struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - InferenceSvc string `protobuf:"bytes,1,opt,name=inference_svc,json=inferenceSvc,proto3" json:"inference_svc,omitempty"` - InferenceHttpPort int32 `protobuf:"varint,2,opt,name=inference_http_port,json=inferenceHttpPort,proto3" json:"inference_http_port,omitempty"` - InferenceGrpcPort int32 `protobuf:"varint,3,opt,name=inference_grpc_port,json=inferenceGrpcPort,proto3" json:"inference_grpc_port,omitempty"` - ServerName string `protobuf:"bytes,4,opt,name=server_name,json=serverName,proto3" json:"server_name,omitempty"` - ReplicaIdx int32 `protobuf:"varint,5,opt,name=replica_idx,json=replicaIdx,proto3" json:"replica_idx,omitempty"` - Capabilities []string `protobuf:"bytes,6,rep,name=capabilities,proto3" json:"capabilities,omitempty"` - Memory uint64 `protobuf:"varint,7,opt,name=memory,proto3" json:"memory,omitempty"` - AvailableMemory uint64 `protobuf:"varint,8,opt,name=available_memory,json=availableMemory,proto3" json:"available_memory,omitempty"` - // Using repeated messages since proto doesn't support complex map keys + InferenceSvc string `protobuf:"bytes,1,opt,name=inference_svc,json=inferenceSvc,proto3" json:"inference_svc,omitempty"` + InferenceHttpPort int32 `protobuf:"varint,2,opt,name=inference_http_port,json=inferenceHttpPort,proto3" json:"inference_http_port,omitempty"` + InferenceGrpcPort int32 `protobuf:"varint,3,opt,name=inference_grpc_port,json=inferenceGrpcPort,proto3" json:"inference_grpc_port,omitempty"` + ServerName string `protobuf:"bytes,4,opt,name=server_name,json=serverName,proto3" json:"server_name,omitempty"` + ReplicaIdx int32 `protobuf:"varint,5,opt,name=replica_idx,json=replicaIdx,proto3" json:"replica_idx,omitempty"` + Capabilities []string `protobuf:"bytes,6,rep,name=capabilities,proto3" json:"capabilities,omitempty"` + Memory uint64 `protobuf:"varint,7,opt,name=memory,proto3" json:"memory,omitempty"` + AvailableMemory uint64 `protobuf:"varint,8,opt,name=available_memory,json=availableMemory,proto3" json:"available_memory,omitempty"` LoadedModels []*ModelVersionID `protobuf:"bytes,9,rep,name=loaded_models,json=loadedModels,proto3" json:"loaded_models,omitempty"` LoadingModels []*ModelVersionID `protobuf:"bytes,10,rep,name=loading_models,json=loadingModels,proto3" json:"loading_models,omitempty"` OverCommitPercentage uint32 `protobuf:"varint,11,opt,name=over_commit_percentage,json=overCommitPercentage,proto3" json:"over_commit_percentage,omitempty"` @@ -1099,63 +1107,43 @@ var file_mlops_scheduler_db_db_proto_rawDesc = []byte{ 0x6f, 0x64, 0x65, 0x6c, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x08, 0x52, 0x05, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x2a, 0xc1, 0x02, 0x0a, 0x0a, 0x4d, 0x6f, 0x64, 0x65, 0x6c, - 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x17, 0x0a, 0x13, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x53, - 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x1b, - 0x0a, 0x17, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x50, 0x52, - 0x4f, 0x47, 0x52, 0x45, 0x53, 0x53, 0x49, 0x4e, 0x47, 0x10, 0x01, 0x12, 0x19, 0x0a, 0x15, 0x4d, - 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x41, 0x56, 0x41, 0x49, 0x4c, - 0x41, 0x42, 0x4c, 0x45, 0x10, 0x02, 0x12, 0x16, 0x0a, 0x12, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, - 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x46, 0x41, 0x49, 0x4c, 0x45, 0x44, 0x10, 0x03, 0x12, 0x1b, - 0x0a, 0x17, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x54, 0x45, - 0x52, 0x4d, 0x49, 0x4e, 0x41, 0x54, 0x49, 0x4e, 0x47, 0x10, 0x04, 0x12, 0x1a, 0x0a, 0x16, 0x4d, - 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x54, 0x45, 0x52, 0x4d, 0x49, - 0x4e, 0x41, 0x54, 0x45, 0x44, 0x10, 0x05, 0x12, 0x20, 0x0a, 0x1c, 0x4d, 0x4f, 0x44, 0x45, 0x4c, - 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x54, 0x45, 0x52, 0x4d, 0x49, 0x4e, 0x41, 0x54, 0x45, - 0x5f, 0x46, 0x41, 0x49, 0x4c, 0x45, 0x44, 0x10, 0x06, 0x12, 0x1f, 0x0a, 0x1b, 0x4d, 0x4f, 0x44, - 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x53, 0x43, 0x48, 0x45, 0x44, 0x55, 0x4c, - 0x45, 0x5f, 0x46, 0x41, 0x49, 0x4c, 0x45, 0x44, 0x10, 0x07, 0x12, 0x1b, 0x0a, 0x17, 0x4d, 0x4f, - 0x44, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x53, 0x43, 0x41, 0x4c, 0x45, 0x44, - 0x5f, 0x44, 0x4f, 0x57, 0x4e, 0x10, 0x08, 0x12, 0x16, 0x0a, 0x12, 0x4d, 0x4f, 0x44, 0x45, 0x4c, - 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x43, 0x52, 0x45, 0x41, 0x54, 0x45, 0x10, 0x09, 0x12, - 0x19, 0x0a, 0x15, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x54, - 0x45, 0x52, 0x4d, 0x49, 0x4e, 0x41, 0x54, 0x45, 0x10, 0x0a, 0x2a, 0xf9, 0x03, 0x0a, 0x11, 0x4d, + 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x2a, 0xf1, 0x01, 0x0a, 0x0a, 0x4d, 0x6f, 0x64, 0x65, 0x6c, + 0x53, 0x74, 0x61, 0x74, 0x65, 0x12, 0x15, 0x0a, 0x11, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x53, 0x74, + 0x61, 0x74, 0x65, 0x55, 0x6e, 0x6b, 0x6e, 0x6f, 0x77, 0x6e, 0x10, 0x00, 0x12, 0x14, 0x0a, 0x10, + 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x50, 0x72, 0x6f, 0x67, 0x72, 0x65, 0x73, 0x73, 0x69, 0x6e, 0x67, + 0x10, 0x01, 0x12, 0x12, 0x0a, 0x0e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x41, 0x76, 0x61, 0x69, 0x6c, + 0x61, 0x62, 0x6c, 0x65, 0x10, 0x02, 0x12, 0x0f, 0x0a, 0x0b, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x46, + 0x61, 0x69, 0x6c, 0x65, 0x64, 0x10, 0x03, 0x12, 0x14, 0x0a, 0x10, 0x4d, 0x6f, 0x64, 0x65, 0x6c, + 0x54, 0x65, 0x72, 0x6d, 0x69, 0x6e, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x10, 0x04, 0x12, 0x13, 0x0a, + 0x0f, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x54, 0x65, 0x72, 0x6d, 0x69, 0x6e, 0x61, 0x74, 0x65, 0x64, + 0x10, 0x05, 0x12, 0x18, 0x0a, 0x14, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x54, 0x65, 0x72, 0x6d, 0x69, + 0x6e, 0x61, 0x74, 0x65, 0x46, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x10, 0x06, 0x12, 0x12, 0x0a, 0x0e, + 0x53, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x46, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x10, 0x07, + 0x12, 0x13, 0x0a, 0x0f, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x53, 0x63, 0x61, 0x6c, 0x65, 0x64, 0x44, + 0x6f, 0x77, 0x6e, 0x10, 0x08, 0x12, 0x0f, 0x0a, 0x0b, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x43, 0x72, + 0x65, 0x61, 0x74, 0x65, 0x10, 0x09, 0x12, 0x12, 0x0a, 0x0e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x54, + 0x65, 0x72, 0x6d, 0x69, 0x6e, 0x61, 0x74, 0x65, 0x10, 0x0a, 0x2a, 0xff, 0x01, 0x0a, 0x11, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x53, 0x74, 0x61, 0x74, 0x65, - 0x12, 0x1f, 0x0a, 0x1b, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, - 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, - 0x00, 0x12, 0x26, 0x0a, 0x22, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, - 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x4c, 0x4f, 0x41, 0x44, 0x5f, 0x52, 0x45, - 0x51, 0x55, 0x45, 0x53, 0x54, 0x45, 0x44, 0x10, 0x01, 0x12, 0x1f, 0x0a, 0x1b, 0x4d, 0x4f, 0x44, - 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, - 0x5f, 0x4c, 0x4f, 0x41, 0x44, 0x49, 0x4e, 0x47, 0x10, 0x02, 0x12, 0x1e, 0x0a, 0x1a, 0x4d, 0x4f, - 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, - 0x45, 0x5f, 0x4c, 0x4f, 0x41, 0x44, 0x45, 0x44, 0x10, 0x03, 0x12, 0x23, 0x0a, 0x1f, 0x4d, 0x4f, - 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, - 0x45, 0x5f, 0x4c, 0x4f, 0x41, 0x44, 0x5f, 0x46, 0x41, 0x49, 0x4c, 0x45, 0x44, 0x10, 0x04, 0x12, - 0x2e, 0x0a, 0x2a, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, - 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4c, 0x4f, 0x41, 0x44, 0x5f, 0x45, 0x4e, - 0x56, 0x4f, 0x59, 0x5f, 0x52, 0x45, 0x51, 0x55, 0x45, 0x53, 0x54, 0x45, 0x44, 0x10, 0x05, 0x12, - 0x28, 0x0a, 0x24, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, - 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4c, 0x4f, 0x41, 0x44, 0x5f, 0x52, 0x45, - 0x51, 0x55, 0x45, 0x53, 0x54, 0x45, 0x44, 0x10, 0x06, 0x12, 0x21, 0x0a, 0x1d, 0x4d, 0x4f, 0x44, - 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, - 0x5f, 0x55, 0x4e, 0x4c, 0x4f, 0x41, 0x44, 0x49, 0x4e, 0x47, 0x10, 0x07, 0x12, 0x20, 0x0a, 0x1c, - 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, - 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4c, 0x4f, 0x41, 0x44, 0x45, 0x44, 0x10, 0x08, 0x12, 0x25, - 0x0a, 0x21, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, - 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x55, 0x4e, 0x4c, 0x4f, 0x41, 0x44, 0x5f, 0x46, 0x41, 0x49, - 0x4c, 0x45, 0x44, 0x10, 0x09, 0x12, 0x21, 0x0a, 0x1d, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, - 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x41, 0x56, 0x41, - 0x49, 0x4c, 0x41, 0x42, 0x4c, 0x45, 0x10, 0x0a, 0x12, 0x2a, 0x0a, 0x26, 0x4d, 0x4f, 0x44, 0x45, - 0x4c, 0x5f, 0x52, 0x45, 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, - 0x4c, 0x4f, 0x41, 0x44, 0x45, 0x44, 0x5f, 0x55, 0x4e, 0x41, 0x56, 0x41, 0x49, 0x4c, 0x41, 0x42, - 0x4c, 0x45, 0x10, 0x0b, 0x12, 0x20, 0x0a, 0x1c, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x5f, 0x52, 0x45, - 0x50, 0x4c, 0x49, 0x43, 0x41, 0x5f, 0x53, 0x54, 0x41, 0x54, 0x45, 0x5f, 0x44, 0x52, 0x41, 0x49, - 0x4e, 0x49, 0x4e, 0x47, 0x10, 0x0c, 0x42, 0x3f, 0x5a, 0x3d, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, - 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x69, 0x6f, 0x2f, 0x73, 0x65, - 0x6c, 0x64, 0x6f, 0x6e, 0x2d, 0x63, 0x6f, 0x72, 0x65, 0x2f, 0x61, 0x70, 0x69, 0x73, 0x2f, 0x67, - 0x6f, 0x2f, 0x76, 0x32, 0x2f, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2f, 0x73, 0x63, 0x68, 0x65, 0x64, - 0x75, 0x6c, 0x65, 0x72, 0x2f, 0x64, 0x62, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, + 0x12, 0x1c, 0x0a, 0x18, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, + 0x53, 0x74, 0x61, 0x74, 0x65, 0x55, 0x6e, 0x6b, 0x6e, 0x6f, 0x77, 0x6e, 0x10, 0x00, 0x12, 0x11, + 0x0a, 0x0d, 0x4c, 0x6f, 0x61, 0x64, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x65, 0x64, 0x10, + 0x01, 0x12, 0x0b, 0x0a, 0x07, 0x4c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x10, 0x02, 0x12, 0x0a, + 0x0a, 0x06, 0x4c, 0x6f, 0x61, 0x64, 0x65, 0x64, 0x10, 0x03, 0x12, 0x0e, 0x0a, 0x0a, 0x4c, 0x6f, + 0x61, 0x64, 0x46, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x10, 0x04, 0x12, 0x13, 0x0a, 0x0f, 0x55, 0x6e, + 0x6c, 0x6f, 0x61, 0x64, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x65, 0x64, 0x10, 0x05, 0x12, + 0x0d, 0x0a, 0x09, 0x55, 0x6e, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x10, 0x06, 0x12, 0x0c, + 0x0a, 0x08, 0x55, 0x6e, 0x6c, 0x6f, 0x61, 0x64, 0x65, 0x64, 0x10, 0x07, 0x12, 0x10, 0x0a, 0x0c, + 0x55, 0x6e, 0x6c, 0x6f, 0x61, 0x64, 0x46, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x10, 0x08, 0x12, 0x0d, + 0x0a, 0x09, 0x41, 0x76, 0x61, 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x10, 0x09, 0x12, 0x15, 0x0a, + 0x11, 0x4c, 0x6f, 0x61, 0x64, 0x65, 0x64, 0x55, 0x6e, 0x61, 0x76, 0x61, 0x69, 0x6c, 0x61, 0x62, + 0x6c, 0x65, 0x10, 0x0a, 0x12, 0x18, 0x0a, 0x14, 0x55, 0x6e, 0x6c, 0x6f, 0x61, 0x64, 0x45, 0x6e, + 0x76, 0x6f, 0x79, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x65, 0x64, 0x10, 0x0b, 0x12, 0x0c, + 0x0a, 0x08, 0x44, 0x72, 0x61, 0x69, 0x6e, 0x69, 0x6e, 0x67, 0x10, 0x0c, 0x42, 0x3f, 0x5a, 0x3d, + 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x73, 0x65, 0x6c, 0x64, 0x6f, + 0x6e, 0x69, 0x6f, 0x2f, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2d, 0x63, 0x6f, 0x72, 0x65, 0x2f, + 0x61, 0x70, 0x69, 0x73, 0x2f, 0x67, 0x6f, 0x2f, 0x76, 0x32, 0x2f, 0x6d, 0x6c, 0x6f, 0x70, 0x73, + 0x2f, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2f, 0x64, 0x62, 0x62, 0x06, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( diff --git a/apis/go/mlops/scheduler/db/model_ext.go b/apis/go/mlops/scheduler/db/model_ext.go index 5f05b89b10..6e032d7f06 100644 --- a/apis/go/mlops/scheduler/db/model_ext.go +++ b/apis/go/mlops/scheduler/db/model_ext.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + package db import ( @@ -10,12 +19,12 @@ func (m *ModelVersion) GetAssignment() []int { var draining []int for k, v := range m.Replicas { - if v.State == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || - v.State == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || - v.State == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE { + if v.State == ModelReplicaState_Loaded || + v.State == ModelReplicaState_Available || + v.State == ModelReplicaState_LoadedUnavailable { assignment = append(assignment, int(k)) } - if v.State == ModelReplicaState_MODEL_REPLICA_STATE_DRAINING { + if v.State == ModelReplicaState_Draining { draining = append(draining, int(k)) } } @@ -113,7 +122,7 @@ func (m *Model) getLastModelGwAvailableModelIdx() int { } lastAvailableIdx := -1 for idx, mv := range m.Versions { - if mv.State.ModelGwState == ModelState_MODEL_STATE_AVAILABLE { + if mv.State.ModelGwState == ModelState_ModelAvailable { lastAvailableIdx = idx } } @@ -137,7 +146,7 @@ func (m *Model) getLastAvailableModelIdx() int { } lastAvailableIdx := -1 for idx, mv := range m.Versions { - if mv.State.State == ModelState_MODEL_STATE_AVAILABLE { + if mv.State.State == ModelState_ModelAvailable { lastAvailableIdx = idx } } @@ -190,6 +199,7 @@ func getInstanceCount(modelRuntimeInfo *pb.ModelRuntimeInfo) uint64 { } func (m *ModelVersion) SetReplicaState(replicaIdx int, state ModelReplicaState, reason string) { + m.initReplicasIfEmpty() m.Replicas[int32(replicaIdx)] = &ReplicaStatus{State: state, Timestamp: timestamppb.Now(), Reason: reason} } @@ -199,10 +209,17 @@ func (m *ModelVersion) UpdateRuntimeInfo(runtimeInfo *pb.ModelRuntimeInfo) { } } +func (m *ModelVersion) initReplicasIfEmpty() { + if m.Replicas == nil { + m.Replicas = make(map[int32]*ReplicaStatus) + } +} + func (m *ModelVersion) GetModelReplicaState(replicaIdx int) ModelReplicaState { + m.initReplicasIfEmpty() state, ok := m.Replicas[int32(replicaIdx)] if !ok { - return ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN + return ModelReplicaState_ModelReplicaStateUnknown } return state.State } @@ -235,7 +252,7 @@ func (m *Model) Inactive() bool { func (m *Model) getLastAvailableModelVersionIdx() int { lastAvailableIdx := -1 for idx, mv := range m.Versions { - if mv.State.State == ModelState_MODEL_STATE_AVAILABLE { + if mv.State.State == ModelState_ModelAvailable { lastAvailableIdx = idx } } @@ -264,38 +281,38 @@ func (m *ModelVersion) DeleteReplica(replicaIdx int) { } func (m ModelReplicaState) CanReceiveTraffic() bool { - return m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || - m == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || - m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE || - m == ModelReplicaState_MODEL_REPLICA_STATE_DRAINING + return m == ModelReplicaState_Loaded || + m == ModelReplicaState_Available || + m == ModelReplicaState_LoadedUnavailable || + m == ModelReplicaState_Draining } func (m ModelReplicaState) AlreadyLoadingOrLoaded() bool { - return m == ModelReplicaState_MODEL_REPLICA_STATE_LOADING || - m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || - m == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || - m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE + return m == ModelReplicaState_Loading || + m == ModelReplicaState_Loaded || + m == ModelReplicaState_Available || + m == ModelReplicaState_LoadedUnavailable } func (m ModelReplicaState) UnloadingOrUnloaded() bool { - return m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED || - m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED || - m == ModelReplicaState_MODEL_REPLICA_STATE_LOADING || - m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED || - m == ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN + return m == ModelReplicaState_UnloadEnvoyRequested || + m == ModelReplicaState_UnloadRequested || + m == ModelReplicaState_Loading || + m == ModelReplicaState_Unloaded || + m == ModelReplicaState_ModelReplicaStateUnknown } func (m ModelReplicaState) Inactive() bool { - return m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED || - m == ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_FAILED || - m == ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN || - m == ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED + return m == ModelReplicaState_Unloaded || + m == ModelReplicaState_UnloadFailed || + m == ModelReplicaState_ModelReplicaStateUnknown || + m == ModelReplicaState_LoadFailed } func (m ModelReplicaState) IsLoadingOrLoaded() bool { - return m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED || - m == ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED || - m == ModelReplicaState_MODEL_REPLICA_STATE_LOADING || - m == ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE || - m == ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE + return m == ModelReplicaState_Loaded || + m == ModelReplicaState_LoadRequested || + m == ModelReplicaState_Loading || + m == ModelReplicaState_Available || + m == ModelReplicaState_LoadedUnavailable } diff --git a/apis/go/mlops/scheduler/db/server_ext.go b/apis/go/mlops/scheduler/db/server_ext.go index 0e9854a940..a2ebf6d2be 100644 --- a/apis/go/mlops/scheduler/db/server_ext.go +++ b/apis/go/mlops/scheduler/db/server_ext.go @@ -1,9 +1,29 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + package db import ( "slices" ) +func (s *Server) initReplicas() { + if s.Replicas == nil { + s.Replicas = map[int32]*ServerReplica{} + } +} + +func (s *Server) AddReplica(replicaID int32, replica *ServerReplica) { + s.initReplicas() + s.Replicas[replicaID] = replica +} + func (s *ServerReplica) GetLoadedOrLoadingModelVersions() []*ModelVersionID { var models []*ModelVersionID models = append(models, s.LoadedModels...) @@ -27,20 +47,27 @@ func (s *ServerReplica) GetNumLoadedModels() int { return len(s.UniqueLoadedModels) } +func (s *ServerReplica) initUniqueLoadedModels() { + if s.UniqueLoadedModels == nil { + s.UniqueLoadedModels = map[string]bool{} + } +} + func (s *ServerReplica) AddModelVersion(modelName string, modelVersion uint32, replicaState ModelReplicaState) { mvID := &ModelVersionID{ Name: modelName, Version: modelVersion, } - if replicaState == ModelReplicaState_MODEL_REPLICA_STATE_LOADING { + if replicaState == ModelReplicaState_Loading { s.addToList(&s.LoadingModels, mvID) return } - if replicaState == ModelReplicaState_MODEL_REPLICA_STATE_LOADED { + if replicaState == ModelReplicaState_Loaded { s.removeFromList(&s.LoadingModels, mvID) s.addToList(&s.LoadedModels, mvID) + s.initUniqueLoadedModels() s.UniqueLoadedModels[modelName] = true } } @@ -55,6 +82,9 @@ func (s *ServerReplica) DeleteModelVersion(modelName string, modelVersion uint32 s.removeFromList(&s.LoadedModels, mvID) if !s.modelExistsInList(s.LoadedModels, modelName) { + if s.UniqueLoadedModels == nil { + return + } delete(s.UniqueLoadedModels, modelName) } } diff --git a/apis/go/mlops/scheduler/scheduler.pb.go b/apis/go/mlops/scheduler/scheduler.pb.go index 5e21a2dbb8..1461692bc2 100644 --- a/apis/go/mlops/scheduler/scheduler.pb.go +++ b/apis/go/mlops/scheduler/scheduler.pb.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.34.2 diff --git a/apis/go/mlops/scheduler/scheduler_grpc.pb.go b/apis/go/mlops/scheduler/scheduler_grpc.pb.go index d4e6a770e7..2f58ce8c17 100644 --- a/apis/go/mlops/scheduler/scheduler_grpc.pb.go +++ b/apis/go/mlops/scheduler/scheduler_grpc.pb.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.4.0 diff --git a/apis/mlops/scheduler/db/db.proto b/apis/mlops/scheduler/db/db.proto index fbfc776bd6..90f6594417 100644 --- a/apis/mlops/scheduler/db/db.proto +++ b/apis/mlops/scheduler/db/db.proto @@ -32,34 +32,34 @@ message ExperimentSnapshot { // ModelState represents the state of a model enum ModelState { - MODEL_STATE_UNKNOWN = 0; - MODEL_STATE_PROGRESSING = 1; - MODEL_STATE_AVAILABLE = 2; - MODEL_STATE_FAILED = 3; - MODEL_STATE_TERMINATING = 4; - MODEL_STATE_TERMINATED = 5; - MODEL_STATE_TERMINATE_FAILED = 6; - MODEL_STATE_SCHEDULE_FAILED = 7; - MODEL_STATE_SCALED_DOWN = 8; - MODEL_STATE_CREATE = 9; - MODEL_STATE_TERMINATE = 10; + ModelStateUnknown = 0; + ModelProgressing = 1; + ModelAvailable = 2; + ModelFailed = 3; + ModelTerminating = 4; + ModelTerminated = 5; + ModelTerminateFailed = 6; + ScheduleFailed = 7; + ModelScaledDown = 8; + ModelCreate = 9; + ModelTerminate = 10; } // ModelReplicaState represents the state of a model replica enum ModelReplicaState { - MODEL_REPLICA_STATE_UNKNOWN = 0; - MODEL_REPLICA_STATE_LOAD_REQUESTED = 1; - MODEL_REPLICA_STATE_LOADING = 2; - MODEL_REPLICA_STATE_LOADED = 3; - MODEL_REPLICA_STATE_LOAD_FAILED = 4; - MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED = 5; - MODEL_REPLICA_STATE_UNLOAD_REQUESTED = 6; - MODEL_REPLICA_STATE_UNLOADING = 7; - MODEL_REPLICA_STATE_UNLOADED = 8; - MODEL_REPLICA_STATE_UNLOAD_FAILED = 9; - MODEL_REPLICA_STATE_AVAILABLE = 10; - MODEL_REPLICA_STATE_LOADED_UNAVAILABLE = 11; - MODEL_REPLICA_STATE_DRAINING = 12; + ModelReplicaStateUnknown = 0; + LoadRequested = 1; + Loading = 2; + Loaded = 3; + LoadFailed = 4; + UnloadRequested = 5; + Unloading = 6; + Unloaded = 7; + UnloadFailed = 8; + Available = 9; + LoadedUnavailable = 10; + UnloadEnvoyRequested = 11; + Draining = 12; } // ReplicaStatus represents the status of a replica diff --git a/scheduler/pkg/agent/server.go b/scheduler/pkg/agent/server.go index 2d6c76c0e2..332b390f03 100644 --- a/scheduler/pkg/agent/server.go +++ b/scheduler/pkg/agent/server.go @@ -11,6 +11,7 @@ package agent import ( "context" + "errors" "fmt" "io" "net" @@ -259,7 +260,7 @@ func (s *Server) Sync(modelName string) { defer s.store.UnlockModel(modelName) model, err := s.store.GetModel(modelName) - if err != nil { + if err != nil && !errors.Is(err, store.ErrNotFound) { logger.WithError(err).Error("Sync failed") return } @@ -273,7 +274,7 @@ func (s *Server) Sync(modelName string) { // we signal a model when other replica is available in case we have servers draining // TODO: extract as helper func if latestModel != nil { - available := latestModel.GetReplicaForState(db.ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE) + available := latestModel.GetReplicaForState(db.ModelReplicaState_Available) if len(available) > 0 { s.waiter.signalModel(modelName) } @@ -281,7 +282,7 @@ func (s *Server) Sync(modelName string) { // Handle any load requests for latest version - we don't want to load models from older versions if latestModel != nil { - for _, replicaIdx := range latestModel.GetReplicaForState(db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED) { + for _, replicaIdx := range latestModel.GetReplicaForState(db.ModelReplicaState_LoadRequested) { logger.Infof("Sending load model request for %s", modelName) as, ok := s.agents[ServerKey{serverName: latestModel.Server, replicaIdx: uint32(replicaIdx)}] @@ -302,14 +303,14 @@ func (s *Server) Sync(modelName string) { logger.WithError(err).Errorf("stream message send failed for model %s and replicaidx %d", modelName, replicaIdx) if errState := s.store.UpdateModelState( latestModel.ModelName(), latestModel.GetVersion(), latestModel.Server, replicaIdx, nil, - db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED, err.Error(), nil); errState != nil { + db.ModelReplicaState_LoadRequested, db.ModelReplicaState_LoadFailed, err.Error(), nil); errState != nil { logger.WithError(errState).Errorf("Sync set model state failed for model %s replicaidx %d", modelName, replicaIdx) } continue } err = s.store.UpdateModelState(latestModel.ModelName(), latestModel.GetVersion(), - latestModel.Server, replicaIdx, nil, db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, - db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING, "", nil) + latestModel.Server, replicaIdx, nil, db.ModelReplicaState_LoadRequested, + db.ModelReplicaState_Loading, "", nil) if err != nil { logger.WithError(err).Errorf("Sync set model state failed for model %s replicaidx %d", modelName, replicaIdx) continue @@ -319,7 +320,7 @@ func (s *Server) Sync(modelName string) { // Loop through all versions and unload any requested - any version of a model might have an unload request for _, modelVersion := range model.Versions { - for _, replicaIdx := range modelVersion.GetReplicaForState(db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED) { + for _, replicaIdx := range modelVersion.GetReplicaForState(db.ModelReplicaState_UnloadRequested) { s.logger.Infof("Sending unload model request for %s:%d", modelName, modelVersion.GetVersion()) as, ok := s.agents[ServerKey{serverName: modelVersion.Server, replicaIdx: uint32(replicaIdx)}] if !ok { @@ -336,13 +337,13 @@ func (s *Server) Sync(modelName string) { logger.WithError(err).Errorf("stream message send failed for model %s and replicaidx %d", modelName, replicaIdx) if errState := s.store.UpdateModelState( latestModel.ModelName(), latestModel.GetVersion(), latestModel.Server, replicaIdx, nil, - db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED, db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_FAILED, err.Error(), nil); errState != nil { + db.ModelReplicaState_UnloadRequested, db.ModelReplicaState_UnloadFailed, err.Error(), nil); errState != nil { logger.WithError(errState).Errorf("Sync set model state failed for model %s replicaidx %d", modelName, replicaIdx) } continue } err = s.store.UpdateModelState(modelVersion.ModelName(), modelVersion.GetVersion(), modelVersion.Server, - replicaIdx, nil, db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED, db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADING, "", nil) + replicaIdx, nil, db.ModelReplicaState_UnloadRequested, db.ModelReplicaState_Unloading, "", nil) if err != nil { logger.WithError(err).Errorf("Sync set model state failed for model %s replicaidx %d", modelName, replicaIdx) continue @@ -364,20 +365,20 @@ func (s *Server) AgentEvent(ctx context.Context, message *pb.ModelEventMessage) var expectedState db.ModelReplicaState switch message.Event { case pb.ModelEventMessage_LOADED: - expectedState = db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING - desiredState = db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED + expectedState = db.ModelReplicaState_Loading + desiredState = db.ModelReplicaState_Loaded case pb.ModelEventMessage_UNLOADED: - expectedState = db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADING - desiredState = db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED + expectedState = db.ModelReplicaState_Unloading + desiredState = db.ModelReplicaState_Unloaded case pb.ModelEventMessage_LOAD_FAILED, pb.ModelEventMessage_LOAD_FAIL_MEMORY: - expectedState = db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING - desiredState = db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED + expectedState = db.ModelReplicaState_Loading + desiredState = db.ModelReplicaState_LoadFailed case pb.ModelEventMessage_UNLOAD_FAILED: - expectedState = db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADING - desiredState = db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_FAILED + expectedState = db.ModelReplicaState_Unloading + desiredState = db.ModelReplicaState_UnloadFailed default: - desiredState = db.ModelReplicaState_MODEL_REPLICA_STATE_UNKNOWN + desiredState = db.ModelReplicaState_ModelReplicaStateUnknown } logger.Infof("Updating state for model %s to %s", message.ModelName, desiredState.String()) s.store.LockModel(message.ModelName) @@ -558,11 +559,11 @@ func (s *Server) drainServerReplicaImpl(serverName string, serverReplicaIdx int) func (s *Server) applyModelScaling(message *pb.ModelScalingTriggerMessage) error { modelName := message.ModelName model, err := s.store.GetModel(modelName) - if err != nil { + if err != nil && !errors.Is(err, store.ErrNotFound) { return err } if model == nil { - return fmt.Errorf("Model %s not found", modelName) + return fmt.Errorf("model %s not found", modelName) } modelProto, err := createScalingPseudoRequest(message, model) diff --git a/scheduler/pkg/envoy/processor/incremental.go b/scheduler/pkg/envoy/processor/incremental.go index 58babf71f4..7130506e79 100644 --- a/scheduler/pkg/envoy/processor/incremental.go +++ b/scheduler/pkg/envoy/processor/incremental.go @@ -10,6 +10,7 @@ the Change License after the Change Date as each is defined in accordance with t package processor import ( + "errors" "fmt" "strconv" "sync" @@ -510,7 +511,7 @@ func (p *IncrementalProcessor) modelUpdate(modelName string) error { logger.Debugf("Calling model update for %s", modelName) model, err := p.modelStore.GetModel(modelName) - if err != nil { + if err != nil && !errors.Is(err, store.ErrNotFound) { logger.WithError(err).Warnf("sync: Failed to sync model %s", modelName) if err := p.removeRouteForServerInEnvoyCache(modelName); err != nil { logger.WithError(err).Errorf("Failed to remove model route from envoy %s", modelName) @@ -646,10 +647,10 @@ func (p *IncrementalProcessor) modelSync() { logger.Debugf("Calling model sync") envoyErr := p.updateEnvoy() - serverReplicaState := db.ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE + serverReplicaState := db.ModelReplicaState_Available reason := "" if envoyErr != nil { - serverReplicaState = db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE + serverReplicaState = db.ModelReplicaState_LoadedUnavailable reason = envoyErr.Error() } @@ -682,7 +683,7 @@ func (p *IncrementalProcessor) modelSync() { for _, replicaIdx := range v.GetAssignment() { serverReplicaExpectedState := vs[replicaIdx].State // Ignore draining nodes to be changed to Available/Failed state - if serverReplicaExpectedState != db.ModelReplicaState_MODEL_REPLICA_STATE_DRAINING { + if serverReplicaExpectedState != db.ModelReplicaState_Draining { err2 := p.modelStore.UpdateModelState( mv.name, v.GetVersion(), @@ -706,7 +707,7 @@ func (p *IncrementalProcessor) modelSync() { } // Go through all replicas that we have set to UnloadEnvoyRequested and mark them as UnloadRequested // to resume the unload process from servers - for _, replicaIdx := range v.GetReplicaForState(db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED) { + for _, replicaIdx := range v.GetReplicaForState(db.ModelReplicaState_UnloadEnvoyRequested) { serverReplicaExpectedState := vs[replicaIdx].State if err := p.modelStore.UpdateModelState( mv.name, @@ -715,7 +716,7 @@ func (p *IncrementalProcessor) modelSync() { replicaIdx, nil, serverReplicaExpectedState, - db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED, + db.ModelReplicaState_UnloadEnvoyRequested, "", nil, ); err != nil { diff --git a/scheduler/pkg/kafka/conflict-resolution/conflict_resolution.go b/scheduler/pkg/kafka/conflict-resolution/conflict_resolution.go index 3e010e71b5..d00f66e2bd 100644 --- a/scheduler/pkg/kafka/conflict-resolution/conflict_resolution.go +++ b/scheduler/pkg/kafka/conflict-resolution/conflict_resolution.go @@ -216,7 +216,7 @@ func CreateNewModelIteration( modelName string, servers []string, ) { - cr.CreateNewIteration(modelName, servers, db.ModelState_MODEL_STATE_UNKNOWN) + cr.CreateNewIteration(modelName, servers, db.ModelState_ModelStateUnknown) } func GetModelStatus( @@ -228,27 +228,27 @@ func GetModelStatus( streams := cr.VectorResponseStatus[modelName] var messageStr = "" - readyCount := cr.GetCountWithStatus(modelName, db.ModelState_MODEL_STATE_AVAILABLE) + readyCount := cr.GetCountWithStatus(modelName, db.ModelState_ModelAvailable) if readyCount > 0 { messageStr += fmt.Sprintf("%d/%d ready ", readyCount, len(streams)) } - terminatedCount := cr.GetCountWithStatus(modelName, db.ModelState_MODEL_STATE_TERMINATED) + terminatedCount := cr.GetCountWithStatus(modelName, db.ModelState_ModelTerminated) if terminatedCount > 0 { messageStr += fmt.Sprintf("%d/%d terminated ", terminatedCount, len(streams)) } - failedCount := cr.GetCountWithStatus(modelName, db.ModelState_MODEL_STATE_FAILED) + failedCount := cr.GetCountWithStatus(modelName, db.ModelState_ModelFailed) if failedCount > 0 { messageStr += fmt.Sprintf("%d/%d failed ", failedCount, len(streams)) } - terminatedFailedCount := cr.GetCountWithStatus(modelName, db.ModelState_MODEL_STATE_TERMINATE_FAILED) + terminatedFailedCount := cr.GetCountWithStatus(modelName, db.ModelState_ModelTerminateFailed) if terminatedFailedCount > 0 { messageStr += fmt.Sprintf("%d/%d terminate failed ", terminatedFailedCount, len(streams)) } - unknownCount := cr.GetCountWithStatus(modelName, db.ModelState_MODEL_STATE_UNKNOWN) + unknownCount := cr.GetCountWithStatus(modelName, db.ModelState_ModelStateUnknown) logger.Infof("Model %s status counts: %s", modelName, messageStr) if message.Update.Op == pb.ModelUpdateMessage_Create { @@ -258,25 +258,25 @@ func GetModelStatus( // TODO: Implement something similar to models to display the numbers // of available replicas if failedCount == len(streams) { - return db.ModelState_MODEL_STATE_FAILED, messageStr + return db.ModelState_ModelFailed, messageStr } if readyCount > 0 && unknownCount == 0 { - return db.ModelState_MODEL_STATE_AVAILABLE, messageStr + return db.ModelState_ModelAvailable, messageStr } - return db.ModelState_MODEL_STATE_PROGRESSING, messageStr + return db.ModelState_ModelProgressing, messageStr } if message.Update.Op == pb.ModelUpdateMessage_Delete { if failedCount > 0 { - return db.ModelState_MODEL_STATE_TERMINATE_FAILED, messageStr + return db.ModelState_ModelTerminateFailed, messageStr } if terminatedCount == len(streams) { - return db.ModelState_MODEL_STATE_TERMINATED, messageStr + return db.ModelState_ModelTerminated, messageStr } - return db.ModelState_MODEL_STATE_TERMINATING, messageStr + return db.ModelState_ModelTerminating, messageStr } - return db.ModelState_MODEL_STATE_UNKNOWN, "Unknown operation or status" + return db.ModelState_ModelStateUnknown, "Unknown operation or status" } func IsModelMessageOutdated( diff --git a/scheduler/pkg/scheduler/cleaner/versions.go b/scheduler/pkg/scheduler/cleaner/versions.go index 2a2c97d29e..14ee3ad2eb 100644 --- a/scheduler/pkg/scheduler/cleaner/versions.go +++ b/scheduler/pkg/scheduler/cleaner/versions.go @@ -59,14 +59,11 @@ func (v *VersionCleaner) cleanupOldVersions(modelName string) error { if err != nil { return err } - if model == nil { - return fmt.Errorf("can't find model with key %s", modelName) - } latest := model.Latest() if latest == nil { return fmt.Errorf("failed to find latest model for %s", modelName) } - if latest.State.State == db.ModelState_MODEL_STATE_AVAILABLE { + if latest.State.State == db.ModelState_ModelAvailable { for _, mv := range model.GetVersionsBeforeLastAvailable() { _, err := v.store.UnloadVersionModels(modelName, mv.GetVersion()) if err != nil { @@ -74,7 +71,7 @@ func (v *VersionCleaner) cleanupOldVersions(modelName string) error { } } } - if latest.State.ModelGwState == db.ModelState_MODEL_STATE_AVAILABLE { + if latest.State.ModelGwState == db.ModelState_ModelAvailable { for _, mv := range model.GetVersionsBeforeLastModelGwAvailable() { _, err := v.store.UnloadModelGwVersionModels(modelName, mv.GetVersion()) if err != nil { diff --git a/scheduler/pkg/scheduler/scheduler.go b/scheduler/pkg/scheduler/scheduler.go index 63d6feb476..23d31a6008 100644 --- a/scheduler/pkg/scheduler/scheduler.go +++ b/scheduler/pkg/scheduler/scheduler.go @@ -128,8 +128,8 @@ func (s *SimpleScheduler) getFailedModels() ([]string, error) { version := model.Latest() if version != nil { versionState := version.State - if versionState.State == db.ModelState_MODEL_STATE_FAILED || versionState.State == db.ModelState_MODEL_STATE_SCHEDULE_FAILED || - ((versionState.State == db.ModelState_MODEL_STATE_AVAILABLE || versionState.State == db.ModelState_MODEL_STATE_PROGRESSING) && + if versionState.State == db.ModelState_ModelFailed || versionState.State == db.ModelState_ScheduleFailed || + ((versionState.State == db.ModelState_ModelAvailable || versionState.State == db.ModelState_ModelProgressing) && versionState.AvailableReplicas < version.ModelDefn.DeploymentSpec.GetReplicas()) { failedModels = append(failedModels, model.Name) } @@ -149,7 +149,7 @@ func (s *SimpleScheduler) scheduleToServer(modelName string) (*coordinator.Serve // Get Model model, err := s.store.GetModel(modelName) - if err != nil { + if err != nil && !errors.Is(err, store.ErrNotFound) { return nil, err } if model == nil { diff --git a/scheduler/pkg/server/server_status.go b/scheduler/pkg/server/server_status.go index 89c1276f5c..eaa3a2acdb 100644 --- a/scheduler/pkg/server/server_status.go +++ b/scheduler/pkg/server/server_status.go @@ -31,13 +31,13 @@ const ( // pollerRetryFailedCreateModels will retry creating models on model-gw which failed to load. Most likely // due to connectivity issues with kafka. func (s *SchedulerServer) pollerRetryFailedCreateModels(ctx context.Context, tick time.Duration, maxRetry uint) { - s.pollerRetryFailedModels(ctx, tick, "pollerRetryFailedCreateModels", db.ModelState_MODEL_STATE_FAILED, "create", maxRetry) + s.pollerRetryFailedModels(ctx, tick, "pollerRetryFailedCreateModels", db.ModelState_ModelFailed, "create", maxRetry) } // pollerRetryFailedDeleteModels will retry deleting models on model-gw which failed to terminate. Most likely // due to connectivity issues with kafka. func (s *SchedulerServer) pollerRetryFailedDeleteModels(ctx context.Context, tick time.Duration, maxRetry uint) { - s.pollerRetryFailedModels(ctx, tick, "pollerRetryFailedDeleteModels", db.ModelState_MODEL_STATE_TERMINATE_FAILED, "delete", maxRetry) + s.pollerRetryFailedModels(ctx, tick, "pollerRetryFailedDeleteModels", db.ModelState_ModelTerminateFailed, "delete", maxRetry) } func (s *SchedulerServer) pollerRetryFailedModels(ctx context.Context, tick time.Duration, funcName string, targetState db.ModelState, operation string, maxRetry uint) { @@ -133,16 +133,16 @@ func (s *SchedulerServer) ModelStatusEvent(_ context.Context, message *pb.ModelU case pb.ModelUpdateMessage_Create: if message.Success { s.resetModelRetryCount(message.Update) - statusVal = db.ModelState_MODEL_STATE_AVAILABLE + statusVal = db.ModelState_ModelAvailable } else { - statusVal = db.ModelState_MODEL_STATE_FAILED + statusVal = db.ModelState_ModelFailed } case pb.ModelUpdateMessage_Delete: if message.Success { s.removeModelRetryCount(message.Update) - statusVal = db.ModelState_MODEL_STATE_TERMINATED + statusVal = db.ModelState_ModelTerminated } else { - statusVal = db.ModelState_MODEL_STATE_TERMINATE_FAILED + statusVal = db.ModelState_ModelTerminateFailed } } @@ -162,7 +162,7 @@ func (s *SchedulerServer) ModelStatusEvent(_ context.Context, message *pb.ModelU confRes.UpdateStatus(modelName, stream, statusVal) modelStatusVal, reason := cr.GetModelStatus(confRes, modelName, message) - if modelStatusVal == db.ModelState_MODEL_STATE_TERMINATED { + if modelStatusVal == db.ModelState_ModelTerminated { confRes.Delete(modelName) } @@ -285,12 +285,12 @@ func (s *SchedulerServer) allPermittedModels() ([]*db.Model, error) { } allowedModelGwStates := map[db.ModelState]struct{}{ - db.ModelState_MODEL_STATE_CREATE: {}, - db.ModelState_MODEL_STATE_PROGRESSING: {}, - db.ModelState_MODEL_STATE_AVAILABLE: {}, - db.ModelState_MODEL_STATE_TERMINATING: {}, + db.ModelState_ModelCreate: {}, + db.ModelState_ModelProgressing: {}, + db.ModelState_ModelAvailable: {}, + db.ModelState_ModelTerminating: {}, // we want to retry models which failed to create on model-gw i.e. likely kafka connectivity issues - db.ModelState_MODEL_STATE_FAILED: {}, + db.ModelState_ModelFailed: {}, } for _, modelName := range modelNames { @@ -363,10 +363,10 @@ func (s *SchedulerServer) modelGwRebalanceForModels(models []*db.Model) { } func (s *SchedulerServer) modelGwRebalanceNoStream(model *db.Model) { - modelState := db.ModelState_MODEL_STATE_CREATE - if model.Latest().State.ModelGwState == db.ModelState_MODEL_STATE_TERMINATING || - model.Latest().State.ModelGwState == db.ModelState_MODEL_STATE_TERMINATE_FAILED { - modelState = db.ModelState_MODEL_STATE_TERMINATED + modelState := db.ModelState_ModelCreate + if model.Latest().State.ModelGwState == db.ModelState_ModelTerminating || + model.Latest().State.ModelGwState == db.ModelState_ModelTerminateFailed { + modelState = db.ModelState_ModelTerminated } s.logger.Debugf( @@ -412,7 +412,7 @@ func (s *SchedulerServer) modelGwReblanceStreams(model *db.Model) { var msg *pb.ModelStatusResponse var err error - if state == db.ModelState_MODEL_STATE_TERMINATING || state == db.ModelState_MODEL_STATE_TERMINATE_FAILED { + if state == db.ModelState_ModelTerminating || state == db.ModelState_ModelTerminateFailed { s.logger.Debugf("Model %s in state %s, sending deletion message", model.Name, state) msg, err = s.createModelDeletionMessage(model, false) } else { @@ -423,7 +423,7 @@ func (s *SchedulerServer) modelGwReblanceStreams(model *db.Model) { if err := s.modelStore.SetModelGwModelState( model.Name, model.Latest().GetVersion(), - db.ModelState_MODEL_STATE_PROGRESSING, + db.ModelState_ModelProgressing, "Rebalance", modelStatusEventSource, ); err != nil { @@ -564,14 +564,14 @@ func (s *SchedulerServer) sendModelStatusEvent(evt coordinator.ModelEventMsg) er } modelState := model.Latest().State - if len(modelGwStreams) == 0 && modelState.ModelGwState != db.ModelState_MODEL_STATE_TERMINATED { + if len(modelGwStreams) == 0 && modelState.ModelGwState != db.ModelState_ModelTerminated { // handle case where we don't have any model-gateway streams errMsg := "No model gateway available to handle model" logger.WithField("model", model.Name).Warn(errMsg) modelGwState := modelState.ModelGwState - if modelState.ModelGwState == db.ModelState_MODEL_STATE_TERMINATE || modelState.ModelGwState == db.ModelState_MODEL_STATE_TERMINATING { - modelGwState = db.ModelState_MODEL_STATE_TERMINATED + if modelState.ModelGwState == db.ModelState_ModelTerminate || modelState.ModelGwState == db.ModelState_ModelTerminating { + modelGwState = db.ModelState_ModelTerminated } if err := s.modelStore.SetModelGwModelState( @@ -591,12 +591,12 @@ func (s *SchedulerServer) sendModelStatusEvent(evt coordinator.ModelEventMsg) er } switch modelState.ModelGwState { - case db.ModelState_MODEL_STATE_CREATE: + case db.ModelState_ModelCreate: logger.Debugf("Model %s is in create state, sending creation message", model.Name) if err := s.modelStore.SetModelGwModelState( model.Name, model.Latest().GetVersion(), - db.ModelState_MODEL_STATE_PROGRESSING, + db.ModelState_ModelProgressing, "Model is being loaded onto model gateway", modelStatusEventSource, ); err != nil { @@ -614,12 +614,12 @@ func (s *SchedulerServer) sendModelStatusEvent(evt coordinator.ModelEventMsg) er // send message to model gateway streams s.sendModelStatusEventToStreamsWithTimestamp(evt, ms, modelGwStreams) - case db.ModelState_MODEL_STATE_TERMINATE: + case db.ModelState_ModelTerminate: logger.Debugf("Model %s is in terminate state, sending deletion message", model.Name) if err := s.modelStore.SetModelGwModelState( model.Name, model.Latest().GetVersion(), - db.ModelState_MODEL_STATE_TERMINATING, + db.ModelState_ModelTerminating, "Model is being unloaded from model gateway", modelStatusEventSource, ); err != nil { diff --git a/scheduler/pkg/store/experiment/state.go b/scheduler/pkg/store/experiment/state.go index d3c493203b..90b9b1919e 100644 --- a/scheduler/pkg/store/experiment/state.go +++ b/scheduler/pkg/store/experiment/state.go @@ -161,7 +161,7 @@ func (es *ExperimentStore) setCandidateAndMirrorReadiness(experiment *Experiment if err != nil { logger.WithError(err).Infof("Failed to get model %s for candidate check for experiment %s", candidate.Name, experiment.Name) } else { - if model.Latest() != nil && model.Latest().State.State == db.ModelState_MODEL_STATE_AVAILABLE { + if model.Latest() != nil && model.Latest().State.State == db.ModelState_ModelAvailable { candidate.Ready = true } else { candidate.Ready = false @@ -173,7 +173,7 @@ func (es *ExperimentStore) setCandidateAndMirrorReadiness(experiment *Experiment if err != nil { logger.WithError(err).Warnf("Failed to get model %s for mirror check for experiment %s", experiment.Mirror.Name, experiment.Name) } else { - if model.Latest() != nil && model.Latest().State.State == db.ModelState_MODEL_STATE_AVAILABLE { + if model.Latest() != nil && model.Latest().State.State == db.ModelState_ModelAvailable { experiment.Mirror.Ready = true } else { experiment.Mirror.Ready = false diff --git a/scheduler/pkg/store/experiment/store.go b/scheduler/pkg/store/experiment/store.go index 80dadf60c2..621886e328 100644 --- a/scheduler/pkg/store/experiment/store.go +++ b/scheduler/pkg/store/experiment/store.go @@ -195,7 +195,7 @@ func (es *ExperimentStore) handleModelEvents(event coordinator.ModelEventMsg) { if err != nil { logger.WithError(err).Warnf("Failed to get model %s for candidate check for experiment %s", event.ModelName, experiment.Name) } else { - if model.Latest() != nil && model.Latest().State.State == db.ModelState_MODEL_STATE_AVAILABLE { + if model.Latest() != nil && model.Latest().State.State == db.ModelState_ModelAvailable { candidate.Ready = true } else { candidate.Ready = false @@ -209,7 +209,7 @@ func (es *ExperimentStore) handleModelEvents(event coordinator.ModelEventMsg) { if err != nil { logger.WithError(err).Warnf("Failed to get model %s for mirror check for experiment %s", event.ModelName, experiment.Name) } else { - if model.Latest() != nil && model.Latest().State.State == db.ModelState_MODEL_STATE_AVAILABLE { + if model.Latest() != nil && model.Latest().State.State == db.ModelState_ModelAvailable { experiment.Mirror.Ready = true } else { experiment.Mirror.Ready = false diff --git a/scheduler/pkg/store/in-memory/store.go b/scheduler/pkg/store/in-memory/store.go index 0b06428b19..9edcbb9d25 100644 --- a/scheduler/pkg/store/in-memory/store.go +++ b/scheduler/pkg/store/in-memory/store.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + package in_memory import ( @@ -24,7 +33,9 @@ func NewStorage[T interface { proto.Message GetName() string }]() *Storage[T] { - return &Storage[T]{} + return &Storage[T]{ + records: make(map[string]T), + } } func (s *Storage[T]) Get(id string) (T, error) { diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index a1e74f7f82..613f4f6abe 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -417,11 +417,11 @@ func (m *ModelServerStore) updateLoadedModelsImpl( for replicaIdx := range assignedReplicaIds { if existingState, ok := modelVersion.Replicas[replicaIdx]; !ok { logger.Debugf( - "Model %s version %d state %s on server %s replica %d does not exist yet and should be loaded", - modelKey, modelVersion.Version, existingState.State.String(), serverKey, replicaIdx, + "Model %s version %d on server %s replica %d does not exist yet and should be loaded", + modelKey, modelVersion.Version, serverKey, replicaIdx, ) - modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED, "") - if err := m.updateReservedMemory(db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, serverKey, + modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_UnloadRequested, "") + if err := m.updateReservedMemory(db.ModelReplicaState_LoadRequested, serverKey, int(replicaIdx), modelVersion.GetRequiredMemory()); err != nil { return nil, fmt.Errorf("failed to update server %s replica %d: %w", serverKey, replicaIdx, err) } @@ -432,8 +432,8 @@ func (m *ModelServerStore) updateLoadedModelsImpl( modelKey, modelVersion.Version, existingState.State.String(), serverKey, replicaIdx, ) if !existingState.State.AlreadyLoadingOrLoaded() { - modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, "") - if err := m.updateReservedMemory(db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, serverKey, + modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_LoadRequested, "") + if err := m.updateReservedMemory(db.ModelReplicaState_LoadRequested, serverKey, int(replicaIdx), modelVersion.GetRequiredMemory()); err != nil { return nil, fmt.Errorf("failed to update server %s replica %d: %w", serverKey, replicaIdx, err) } @@ -449,8 +449,8 @@ func (m *ModelServerStore) updateLoadedModelsImpl( "Checking if replicaidx %d with state %s should be unloaded", replicaIdx, existingState.State.String(), ) - if !existingState.State.UnloadingOrUnloaded() && existingState.State != db.ModelReplicaState_MODEL_REPLICA_STATE_DRAINING { - modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED, "") + if !existingState.State.UnloadingOrUnloaded() && existingState.State != db.ModelReplicaState_Draining { + modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_UnloadEnvoyRequested, "") replicaStateUpdated = true } } @@ -466,9 +466,9 @@ func (m *ModelServerStore) updateLoadedModelsImpl( // in modelVersion.state.AvailableReplicas (we call updateModelStatus later) // TODO: the conditions here keep growing, refactor or consider a simpler check. - if replicaStateUpdated || modelVersion.State.State == db.ModelState_MODEL_STATE_SCHEDULE_FAILED || model.Deleted || - modelVersion.State.State == db.ModelState_MODEL_STATE_PROGRESSING || - (modelVersion.State.State == db.ModelState_MODEL_STATE_AVAILABLE && len(modelVersion.GetAssignment()) < modelVersion.DesiredReplicas()) { + if replicaStateUpdated || modelVersion.State.State == db.ModelState_ScheduleFailed || model.Deleted || + modelVersion.State.State == db.ModelState_ModelProgressing || + (modelVersion.State.State == db.ModelState_ModelAvailable && len(modelVersion.GetAssignment()) < modelVersion.DesiredReplicas()) { logger.Debugf("Updating model status for model %s server %s", modelKey, serverKey) modelVersion.Server = serverKey if err := m.updateModelStatus(true, model.Deleted, modelVersion, model.GetLastAvailableModelVersion(), model); err != nil { @@ -525,7 +525,7 @@ func (m *ModelServerStore) unloadVersionModelsImpl(modelKey string, version uint replicaIdx, existingState.State.String(), ) - modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED, "") + modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_UnloadRequested, "") updated = true continue } @@ -667,14 +667,14 @@ func (m *ModelServerStore) updateModelStateImpl( ) // Update models loaded onto replica for relevant state - if desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED || - desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING || - desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED || - desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED { + if desiredState == db.ModelReplicaState_Loaded || + desiredState == db.ModelReplicaState_Loading || + desiredState == db.ModelReplicaState_Unloaded || + desiredState == db.ModelReplicaState_LoadFailed { replica, ok := server.Replicas[int32(replicaIdx)] if ok { - if desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED || desiredState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING { + if desiredState == db.ModelReplicaState_Loaded || desiredState == db.ModelReplicaState_Loading { logger.Infof( "Adding model %s(%d) to server %s replica %d list of loaded / loading models", modelKey, version, serverKey, replicaIdx, @@ -735,11 +735,11 @@ func (m *ModelServerStore) updateReservedMemory( replica, okReplica := server.Replicas[int32(replicaIdx)] update := false if okReplica { - if modelReplicaState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED { + if modelReplicaState == db.ModelReplicaState_LoadRequested { replica.UpdateReservedMemory(memBytes, true) update = true - } else if modelReplicaState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED || - modelReplicaState == db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED { + } else if modelReplicaState == db.ModelReplicaState_LoadFailed || + modelReplicaState == db.ModelReplicaState_Loaded { replica.UpdateReservedMemory(memBytes, false) update = true } @@ -797,7 +797,8 @@ func (m *ModelServerStore) addServerReplicaImpl(request *agent.AgentSubscribeReq request.ReplicaConfig, request.AvailableMemoryBytes, ) - server.Replicas[int32(request.ReplicaIdx)] = serverReplica + + server.AddReplica(int32(request.ReplicaIdx), serverReplica) if err := m.store.servers.Update(server); err != nil { return nil, coordinator.ServerEventMsg{}, fmt.Errorf("failed to update server %s: %w", request.ServerName, err) @@ -809,7 +810,7 @@ func (m *ModelServerStore) addServerReplicaImpl(request *agent.AgentSubscribeReq if err != nil { return nil, coordinator.ServerEventMsg{}, err } - modelVersion.Replicas[int32(request.ReplicaIdx)] = &db.ReplicaStatus{State: db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED} + modelVersion.Replicas[int32(request.ReplicaIdx)] = &db.ReplicaStatus{State: db.ModelReplicaState_Loaded} modelVersion.Server = request.ServerName if err := m.updateModelStatus(true, false, modelVersion, model.GetLastAvailableModelVersion(), model); err != nil { return nil, coordinator.ServerEventMsg{}, fmt.Errorf("failed to update model status: %w", err) @@ -903,7 +904,7 @@ func (m *ModelServerStore) removeModelfromServerReplica(models []*db.ModelVersio "Model %s is being deleted and server replica %d is disconnected, skipping", v.Name, replicaIdx, ) - modelVersion.SetReplicaState(replicaIdx, db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED, + modelVersion.SetReplicaState(replicaIdx, db.ModelReplicaState_Unloaded, "model is removed when server replica was removed") m.LockModel(v.Name) if err := m.updateModelStatus( @@ -985,7 +986,7 @@ func (m *ModelServerStore) findModelsToReSchedule(models []*db.ModelVersionID, r if err == nil { modelVersion := model.GetVersion(v.Version) if modelVersion != nil { - modelVersion.SetReplicaState(replicaIdx, db.ModelReplicaState_MODEL_REPLICA_STATE_DRAINING, "trigger to drain") + modelVersion.SetReplicaState(replicaIdx, db.ModelReplicaState_Draining, "trigger to drain") if err := m.store.models.Update(model); err != nil { return nil, fmt.Errorf("failed update model %s: %w", model.Name, err) } diff --git a/scheduler/pkg/store/memory_status.go b/scheduler/pkg/store/memory_status.go index 50d9304b4a..1fa61c0472 100644 --- a/scheduler/pkg/store/memory_status.go +++ b/scheduler/pkg/store/memory_status.go @@ -42,29 +42,29 @@ func calcModelVersionStatistics(modelVersion *db.ModelVersion, deleted bool) *mo for _, replicaState := range modelVersion.Replicas { switch replicaState.State { - case db.ModelReplicaState_MODEL_REPLICA_STATE_AVAILABLE: + case db.ModelReplicaState_Available: s.replicasAvailable++ - case db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_REQUESTED, - db.ModelReplicaState_MODEL_REPLICA_STATE_LOADING, db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED: // unavailable but OK + case db.ModelReplicaState_LoadRequested, + db.ModelReplicaState_Loading, db.ModelReplicaState_Loaded: // unavailable but OK s.replicasLoading++ - case db.ModelReplicaState_MODEL_REPLICA_STATE_LOAD_FAILED, db.ModelReplicaState_MODEL_REPLICA_STATE_LOADED_UNAVAILABLE: // unavailable but not OK + case db.ModelReplicaState_LoadFailed, db.ModelReplicaState_LoadedUnavailable: // unavailable but not OK s.replicasLoadFailed++ if !deleted && replicaState.Timestamp.AsTime().After(s.lastFailedStateTime) { s.lastFailedStateTime = replicaState.Timestamp.AsTime() s.lastFailedReason = replicaState.Reason } - case db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_ENVOY_REQUESTED, - db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_REQUESTED, db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADING: + case db.ModelReplicaState_UnloadEnvoyRequested, + db.ModelReplicaState_UnloadRequested, db.ModelReplicaState_Unloading: s.replicasUnloading++ - case db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOADED: + case db.ModelReplicaState_Unloaded: s.replicasUnloaded++ - case db.ModelReplicaState_MODEL_REPLICA_STATE_UNLOAD_FAILED: + case db.ModelReplicaState_UnloadFailed: s.replicasUnloadFailed++ if deleted && replicaState.Timestamp.AsTime().After(s.lastFailedStateTime) { s.lastFailedStateTime = replicaState.Timestamp.AsTime() s.lastFailedReason = replicaState.Reason } - case db.ModelReplicaState_MODEL_REPLICA_STATE_DRAINING: + case db.ModelReplicaState_Draining: s.replicasDraining++ } if replicaState.Timestamp.AsTime().After(s.latestTime) { @@ -81,31 +81,31 @@ func updateModelState(isLatest bool, modelVersion *db.ModelVersion, prevModelVer modelTimestamp := stats.latestTime if deleted || !isLatest { if stats.replicasUnloadFailed > 0 { - modelState = db.ModelState_MODEL_STATE_TERMINATED + modelState = db.ModelState_ModelTerminated modelReason = stats.lastFailedReason modelTimestamp = stats.lastFailedStateTime } else if stats.replicasUnloading > 0 || stats.replicasAvailable > 0 || stats.replicasLoading > 0 { - modelState = db.ModelState_MODEL_STATE_TERMINATING + modelState = db.ModelState_ModelTerminating } else { - modelState = db.ModelState_MODEL_STATE_TERMINATED + modelState = db.ModelState_ModelTerminated } } else { if stats.replicasLoadFailed > 0 { - modelState = db.ModelState_MODEL_STATE_FAILED + modelState = db.ModelState_ModelFailed modelReason = stats.lastFailedReason modelTimestamp = stats.lastFailedStateTime } else if modelVersion.ModelDefn.DeploymentSpec != nil && stats.replicasAvailable == 0 && modelVersion.ModelDefn.DeploymentSpec.Replicas == 0 && modelVersion.ModelDefn.DeploymentSpec.MinReplicas == 0 { - modelState = db.ModelState_MODEL_STATE_SCALED_DOWN + modelState = db.ModelState_ModelScaledDown } else if (modelVersion.ModelDefn.DeploymentSpec != nil && stats.replicasAvailable == modelVersion.ModelDefn.DeploymentSpec.Replicas) || // equal to desired replicas (modelVersion.ModelDefn.DeploymentSpec != nil && stats.replicasAvailable >= modelVersion.ModelDefn.DeploymentSpec.MinReplicas && modelVersion.ModelDefn.DeploymentSpec.MinReplicas > 0) || // min replicas is set and available replicas are greater than or equal to min replicas (stats.replicasAvailable > 0 && prevModelVersion != nil && modelVersion != prevModelVersion && - prevModelVersion.State.State == db.ModelState_MODEL_STATE_AVAILABLE) { - modelState = db.ModelState_MODEL_STATE_AVAILABLE + prevModelVersion.State.State == db.ModelState_ModelAvailable) { + modelState = db.ModelState_ModelAvailable } else { - modelState = db.ModelState_MODEL_STATE_PROGRESSING + modelState = db.ModelState_ModelProgressing } } @@ -138,7 +138,7 @@ func (m *ModelServerStore) FailedScheduling(modelID string, version uint32, reas // we use len of GetAssignment instead of .state.AvailableReplicas as it is more accurate in this context availableReplicas := uint32(len(modelVersion.GetAssignment())) modelVersion.State = &db.ModelStatus{ - State: db.ModelState_MODEL_STATE_SCHEDULE_FAILED, + State: db.ModelState_ScheduleFailed, ModelGwState: modelVersion.State.ModelGwState, Reason: reason, ModelGwReason: modelVersion.State.ModelGwReason, @@ -184,11 +184,11 @@ func (m *ModelServerStore) updateModelStatus(isLatest bool, deleted bool, modelV func (m *ModelServerStore) setModelGwStatusToTerminate(isLatest bool, modelVersion *db.ModelVersion) { if !isLatest { - modelVersion.State.ModelGwState = db.ModelState_MODEL_STATE_TERMINATED + modelVersion.State.ModelGwState = db.ModelState_ModelTerminated modelVersion.State.ModelGwReason = "Not latest version" return } - modelVersion.State.ModelGwState = db.ModelState_MODEL_STATE_TERMINATE + modelVersion.State.ModelGwState = db.ModelState_ModelTerminate modelVersion.State.ModelGwReason = "Model deleted" } diff --git a/scheduler/pkg/store/mesh.go b/scheduler/pkg/store/mesh.go index a71a6aa6fb..00ea70b97b 100644 --- a/scheduler/pkg/store/mesh.go +++ b/scheduler/pkg/store/mesh.go @@ -85,8 +85,8 @@ func NewDefaultModelVersion(model *pb.Model, version uint32) *db.ModelVersion { ModelDefn: model, Replicas: make(map[int32]*db.ReplicaStatus, 0), State: &db.ModelStatus{ - State: db.ModelState_MODEL_STATE_UNKNOWN, - ModelGwState: db.ModelState_MODEL_STATE_CREATE, + State: db.ModelState_ModelStateUnknown, + ModelGwState: db.ModelState_ModelCreate, }, } } diff --git a/scheduler/pkg/store/mock/store.go b/scheduler/pkg/store/mock/store.go index f73802a318..0e773d90aa 100644 --- a/scheduler/pkg/store/mock/store.go +++ b/scheduler/pkg/store/mock/store.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Code generated by MockGen. DO NOT EDIT. // Source: ./api.go // diff --git a/scheduler/pkg/store/pipeline/status.go b/scheduler/pkg/store/pipeline/status.go index 3ef4488b4a..27045135de 100644 --- a/scheduler/pkg/store/pipeline/status.go +++ b/scheduler/pkg/store/pipeline/status.go @@ -75,7 +75,7 @@ func (ms *ModelStatusHandler) setPipelineModelsReady(pipelineVersion *PipelineVe step.Available = false if model != nil { lastAvailableModelVersion := model.GetLastAvailableModelVersion() - if lastAvailableModelVersion != nil && lastAvailableModelVersion.State.ModelGwState == db.ModelState_MODEL_STATE_AVAILABLE { + if lastAvailableModelVersion != nil && lastAvailableModelVersion.State.ModelGwState == db.ModelState_ModelAvailable { step.Available = true } } diff --git a/scheduler/pkg/store/pipeline/store.go b/scheduler/pkg/store/pipeline/store.go index b234a25aeb..8b88e76e83 100644 --- a/scheduler/pkg/store/pipeline/store.go +++ b/scheduler/pkg/store/pipeline/store.go @@ -612,7 +612,7 @@ func (ps *PipelineStore) handleModelEvents(event coordinator.ModelEventMsg) { ps.mu.Lock() modelVersion := model.GetLastAvailableModelVersion() - modelAvailable := model != nil && modelVersion != nil && modelVersion.State.ModelGwState == db.ModelState_MODEL_STATE_AVAILABLE + modelAvailable := model != nil && modelVersion != nil && modelVersion.State.ModelGwState == db.ModelState_ModelAvailable evts := updatePipelinesFromModelAvailability(refs, event.ModelName, modelAvailable, ps.pipelines, ps.logger) ps.mu.Unlock() From 3d8b9232f1a17b36ba1085cf3d568c38e48aa91c Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 5 Jan 2026 16:14:56 +0000 Subject: [PATCH 11/50] fix new model version numbers not being set --- scheduler/pkg/store/memory.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index 613f4f6abe..db5d01ec28 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -180,7 +180,9 @@ func (m *ModelServerStore) UpdateModel(req *pb.LoadModelRequest) error { modelName, ) latestModel.ModelDefn.DeploymentSpec = req.GetModel().GetDeploymentSpec() - case meq.MetaDiffers: + } + + if meq.MetaDiffers { // Update just kubernetes meta latestModel.ModelDefn.Meta.KubernetesMeta = req.GetModel().GetMeta().GetKubernetesMeta() } @@ -699,6 +701,10 @@ func (m *ModelServerStore) updateModelStateImpl( return nil, fmt.Errorf("update model status failed: %w", err) } + if err := m.store.servers.Update(server); err != nil { + return nil, fmt.Errorf("failed to update server %s: %w", serverKey, err) + } + modelEvt := &coordinator.ModelEventMsg{ ModelName: modelVersion.ModelDefn.Meta.Name, ModelVersion: modelVersion.GetVersion(), From 3d80eb8e260fe7f76ad2b5b9c04be87d46a800bb Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 5 Jan 2026 17:33:32 +0000 Subject: [PATCH 12/50] add context for future DB calls --- scheduler/pkg/store/in-memory/store.go | 11 +-- scheduler/pkg/store/memory.go | 89 +++++++++++++------------ scheduler/pkg/store/memory_status.go | 11 +-- scheduler/pkg/store/storage.go | 12 ++-- scheduler/pkg/store/test_memory_hack.go | 5 +- 5 files changed, 67 insertions(+), 61 deletions(-) diff --git a/scheduler/pkg/store/in-memory/store.go b/scheduler/pkg/store/in-memory/store.go index 9edcbb9d25..d42c74c7ad 100644 --- a/scheduler/pkg/store/in-memory/store.go +++ b/scheduler/pkg/store/in-memory/store.go @@ -10,6 +10,7 @@ the Change License after the Change Date as each is defined in accordance with t package in_memory import ( + "context" "fmt" "sync" @@ -38,7 +39,7 @@ func NewStorage[T interface { } } -func (s *Storage[T]) Get(id string) (T, error) { +func (s *Storage[T]) Get(_ context.Context, id string) (T, error) { s.mu.RLock() defer s.mu.RUnlock() @@ -50,7 +51,7 @@ func (s *Storage[T]) Get(id string) (T, error) { return proto.Clone(record).(T), nil } -func (s *Storage[T]) Insert(record T) error { +func (s *Storage[T]) Insert(_ context.Context, record T) error { s.mu.Lock() defer s.mu.Unlock() @@ -62,7 +63,7 @@ func (s *Storage[T]) Insert(record T) error { return nil } -func (s *Storage[T]) List() ([]T, error) { +func (s *Storage[T]) List(_ context.Context) ([]T, error) { s.mu.RLock() defer s.mu.RUnlock() @@ -74,7 +75,7 @@ func (s *Storage[T]) List() ([]T, error) { return records, nil } -func (s *Storage[T]) Update(record T) error { +func (s *Storage[T]) Update(_ context.Context, record T) error { s.mu.Lock() defer s.mu.Unlock() @@ -86,7 +87,7 @@ func (s *Storage[T]) Update(record T) error { return nil } -func (s *Storage[T]) Delete(id string) error { +func (s *Storage[T]) Delete(_ context.Context, id string) error { s.mu.Lock() defer s.mu.Unlock() diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index db5d01ec28..b83f603965 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -10,6 +10,7 @@ the Change License after the Change Date as each is defined in accordance with t package store import ( + "context" "errors" "fmt" "sort" @@ -59,7 +60,7 @@ func (m *ModelServerStore) GetAllModels() ([]string, error) { defer m.mu.RUnlock() var modelNames []string - models, err := m.store.models.List() + models, err := m.store.models.List(context.TODO()) if err != nil { return nil, err } @@ -73,18 +74,18 @@ func (m *ModelServerStore) GetAllModels() ([]string, error) { func (m *ModelServerStore) GetModels() ([]*db.Model, error) { m.mu.RLock() defer m.mu.RUnlock() - return m.store.models.List() + return m.store.models.List(context.TODO()) } func (m *ModelServerStore) addModelVersionIfNotExists(req *agent.ModelVersion) (*db.Model, *db.ModelVersion, error) { modelName := req.GetModel().GetMeta().GetName() - model, err := m.store.models.Get(modelName) + model, err := m.store.models.Get(context.TODO(), modelName) if err != nil { if !errors.Is(err, ErrNotFound) { return nil, nil, err } model = &db.Model{Name: modelName} - if err := m.store.models.Insert(model); err != nil { + if err := m.store.models.Insert(context.TODO(), model); err != nil { return nil, nil, err } } @@ -132,14 +133,14 @@ func (m *ModelServerStore) UpdateModel(req *pb.LoadModelRequest) error { modelName, ) } - model, err := m.store.models.Get(modelName) + model, err := m.store.models.Get(context.TODO(), modelName) if err != nil { if !errors.Is(err, ErrNotFound) { return fmt.Errorf("failed to get model %s: %w", modelName, err) } model = &db.Model{Name: modelName} m.addNextModelVersion(model, req.GetModel()) - if err := m.store.models.Insert(model); err != nil { + if err := m.store.models.Insert(context.TODO(), model); err != nil { return fmt.Errorf("failed to update model %s: %w", modelName, err) } return nil @@ -148,7 +149,7 @@ func (m *ModelServerStore) UpdateModel(req *pb.LoadModelRequest) error { if model.Deleted { if model.Inactive() { m.addNextModelVersion(model, req.GetModel()) - if err := m.store.models.Update(model); err != nil { + if err := m.store.models.Update(context.TODO(), model); err != nil { return fmt.Errorf("failed to update model %s: %w", modelName, err) } return nil @@ -187,7 +188,7 @@ func (m *ModelServerStore) UpdateModel(req *pb.LoadModelRequest) error { latestModel.ModelDefn.Meta.KubernetesMeta = req.GetModel().GetMeta().GetKubernetesMeta() } - if err := m.store.models.Update(model); err != nil { + if err := m.store.models.Update(context.TODO(), model); err != nil { return fmt.Errorf("failed to update model %s: %w", modelName, err) } return nil @@ -225,7 +226,7 @@ func (m *ModelServerStore) UnlockModel(modelId string) { func (m *ModelServerStore) GetModel(key string) (*db.Model, error) { m.mu.RLock() defer m.mu.RUnlock() - model, err := m.store.models.Get(key) + model, err := m.store.models.Get(context.TODO(), key) if err != nil { return nil, fmt.Errorf("failed to get model %s: %w", key, err) } @@ -245,7 +246,7 @@ func (m *ModelServerStore) removeModelImpl(req *pb.UnloadModelRequest) error { defer m.mu.Unlock() modelName := req.GetModel().GetName() - model, err := m.store.models.Get(modelName) + model, err := m.store.models.Get(context.TODO(), modelName) if err == nil { // Updating the k8s meta is required to be updated so status updates back (to manager) // will match latest generation value. Previous generation values might be ignored by manager. @@ -267,14 +268,14 @@ func (m *ModelServerStore) removeModelImpl(req *pb.UnloadModelRequest) error { func (m *ModelServerStore) GetServers(shallow bool, modelDetails bool) ([]*db.Server, error) { m.mu.RLock() defer m.mu.RUnlock() - return m.store.servers.List() + return m.store.servers.List(context.TODO()) } func (m *ModelServerStore) GetServer(serverKey string, _ bool, modelDetails bool) (*db.Server, *ServerStats, error) { m.mu.RLock() defer m.mu.RUnlock() - server, err := m.store.servers.Get(serverKey) + server, err := m.store.servers.Get(context.TODO(), serverKey) if err != nil { if errors.Is(err, ErrNotFound) { return nil, nil, fmt.Errorf("server [%s] not found", serverKey) @@ -317,7 +318,7 @@ func (m *ModelServerStore) getModelServer( serverKey string, ) (*db.Model, *db.ModelVersion, *db.Server, error) { // Validate - model, err := m.store.models.Get(modelKey) + model, err := m.store.models.Get(context.TODO(), modelKey) if err != nil { return nil, nil, nil, fmt.Errorf("failed to find model %s", modelKey) } @@ -325,7 +326,7 @@ func (m *ModelServerStore) getModelServer( if modelVersion == nil { return nil, nil, nil, fmt.Errorf("Version not found for model %s, version %d", modelKey, version) } - server, err := m.store.servers.Get(serverKey) + server, err := m.store.servers.Get(context.TODO(), serverKey) if err != nil { return nil, nil, nil, fmt.Errorf("failed to find server %s", serverKey) } @@ -362,7 +363,7 @@ func (m *ModelServerStore) updateLoadedModelsImpl( logger := m.logger.WithField("func", "updateLoadedModelsImpl") // Validate - model, err := m.store.models.Get(modelKey) + model, err := m.store.models.Get(context.TODO(), modelKey) if err != nil { if errors.Is(err, ErrNotFound) { return nil, fmt.Errorf("model [%s] not found", modelKey) @@ -389,7 +390,7 @@ func (m *ModelServerStore) updateLoadedModelsImpl( }, nil } - server, err := m.store.servers.Get(serverKey) + server, err := m.store.servers.Get(context.TODO(), serverKey) if err != nil { return nil, fmt.Errorf("failed to find server %s: %w", serverKey, err) } @@ -408,7 +409,7 @@ func (m *ModelServerStore) updateLoadedModelsImpl( for modelVersion.HasServer() && modelVersion.Server != serverKey { logger.Debugf("Adding new version as server changed to %s from %s", modelVersion.Server, serverKey) m.addNextModelVersion(model, model.Latest().ModelDefn) - if err := m.store.models.Update(model); err != nil { + if err := m.store.models.Update(context.TODO(), model); err != nil { return nil, fmt.Errorf("failed to update model %s: %w", modelKey, err) } modelVersion = model.Latest() @@ -507,7 +508,7 @@ func (m *ModelServerStore) unloadVersionModelsImpl(modelKey string, version uint defer m.mu.Unlock() // Validate - model, err := m.store.models.Get(modelKey) + model, err := m.store.models.Get(context.TODO(), modelKey) if err != nil { return nil, false, fmt.Errorf("failed to find model %s", modelKey) } @@ -608,10 +609,10 @@ func (m *ModelServerStore) updateModelState( } // TODO should be in a transaction to rollback if updating servers fail - if err := m.store.models.Update(res.model); err != nil { + if err := m.store.models.Update(context.TODO(), res.model); err != nil { return nil, nil, fmt.Errorf("failed to update model %s: %w", modelKey, err) } - if err := m.store.servers.Update(res.server); err != nil { + if err := m.store.servers.Update(context.TODO(), res.server); err != nil { return nil, nil, fmt.Errorf("failed to update server %s: %w", serverKey, err) } @@ -701,7 +702,7 @@ func (m *ModelServerStore) updateModelStateImpl( return nil, fmt.Errorf("update model status failed: %w", err) } - if err := m.store.servers.Update(server); err != nil { + if err := m.store.servers.Update(context.TODO(), server); err != nil { return nil, fmt.Errorf("failed to update server %s: %w", serverKey, err) } @@ -733,7 +734,7 @@ func (m *ModelServerStore) updateReservedMemory( ) error { // update reserved memory that is being used for sorting replicas // do we need to lock replica update? - server, err := m.store.servers.Get(serverKey) + server, err := m.store.servers.Get(context.TODO(), serverKey) if err != nil { return fmt.Errorf("get server %s failed: %w", serverKey, err) } @@ -752,7 +753,7 @@ func (m *ModelServerStore) updateReservedMemory( } if update { - return m.store.servers.Update(server) + return m.store.servers.Update(context.TODO(), server) } return nil } @@ -782,13 +783,13 @@ func (m *ModelServerStore) addServerReplicaImpl(request *agent.AgentSubscribeReq m.mu.Lock() defer m.mu.Unlock() - server, err := m.store.servers.Get(request.ServerName) + server, err := m.store.servers.Get(context.TODO(), request.ServerName) if err != nil { if !errors.Is(err, ErrNotFound) { return nil, coordinator.ServerEventMsg{}, err } server = NewServer(request.ServerName, request.Shared) - if err := m.store.servers.Insert(server); err != nil { + if err := m.store.servers.Insert(context.TODO(), server); err != nil { return nil, coordinator.ServerEventMsg{}, fmt.Errorf("failed to add server %s: %w", request.ServerName, err) } } @@ -806,7 +807,7 @@ func (m *ModelServerStore) addServerReplicaImpl(request *agent.AgentSubscribeReq server.AddReplica(int32(request.ReplicaIdx), serverReplica) - if err := m.store.servers.Update(server); err != nil { + if err := m.store.servers.Update(context.TODO(), server); err != nil { return nil, coordinator.ServerEventMsg{}, fmt.Errorf("failed to update server %s: %w", request.ServerName, err) } @@ -856,7 +857,7 @@ func (m *ModelServerStore) removeServerReplicaImpl(serverName string, replicaIdx m.mu.Lock() defer m.mu.Unlock() - server, err := m.store.servers.Get(serverName) + server, err := m.store.servers.Get(context.TODO(), serverName) if err != nil { return nil, nil, fmt.Errorf("failed to find server %s: %w", serverName, err) } @@ -866,13 +867,13 @@ func (m *ModelServerStore) removeServerReplicaImpl(serverName string, replicaIdx } delete(server.Replicas, int32(replicaIdx)) - if err := m.store.servers.Update(server); err != nil { + if err := m.store.servers.Update(context.TODO(), server); err != nil { return nil, nil, fmt.Errorf("failed to update server %s: %w", serverName, err) } // TODO we should not reschedule models on servers with dedicated models, e.g. non shareable servers if len(server.Replicas) == 0 { - if err := m.store.servers.Delete(serverName); err != nil { + if err := m.store.servers.Delete(context.TODO(), serverName); err != nil { return nil, nil, fmt.Errorf("failed to delete server %s: %w", serverName, err) } } @@ -898,7 +899,7 @@ func (m *ModelServerStore) removeModelfromServerReplica(models []*db.ModelVersio // Find models to reschedule due to this server replica being removed for _, v := range models { // TODO pointer issue - model, err := m.store.models.Get(v.Name) + model, err := m.store.models.Get(context.TODO(), v.Name) if err == nil { modelVersion := model.GetVersion(v.Version) if modelVersion != nil { @@ -929,7 +930,7 @@ func (m *ModelServerStore) removeModelfromServerReplica(models []*db.ModelVersio ) } else { modelNames = append(modelNames, v.Name) - if err := m.store.models.Update(model); err != nil { + if err := m.store.models.Update(context.TODO(), model); err != nil { return nil, nil, fmt.Errorf("failed to update model %s status: %w", model.Name, err) } } @@ -949,7 +950,7 @@ func (m *ModelServerStore) DrainServerReplica(serverName string, replicaIdx int) func (m *ModelServerStore) drainServerReplicaImpl(serverName string, replicaIdx int) ([]string, error) { logger := m.logger.WithField("func", "DrainServerReplica") - server, err := m.store.servers.Get(serverName) + server, err := m.store.servers.Get(context.TODO(), serverName) if err != nil { return nil, fmt.Errorf("failed to find server %s", serverName) } @@ -961,7 +962,7 @@ func (m *ModelServerStore) drainServerReplicaImpl(serverName string, replicaIdx // we mark this server replica as draining so should not be used in future scheduling decisions serverReplica.IsDraining = true - if err := m.store.servers.Update(server); err != nil { + if err := m.store.servers.Update(context.TODO(), server); err != nil { return nil, fmt.Errorf("failed to update server %s: %w", serverName, err) } @@ -988,12 +989,12 @@ func (m *ModelServerStore) findModelsToReSchedule(models []*db.ModelVersionID, r modelsReSchedule := make([]string, 0) for _, v := range models { - model, err := m.store.models.Get(v.Name) + model, err := m.store.models.Get(context.TODO(), v.Name) if err == nil { modelVersion := model.GetVersion(v.Version) if modelVersion != nil { modelVersion.SetReplicaState(replicaIdx, db.ModelReplicaState_Draining, "trigger to drain") - if err := m.store.models.Update(model); err != nil { + if err := m.store.models.Update(context.TODO(), model); err != nil { return nil, fmt.Errorf("failed update model %s: %w", model.Name, err) } modelsReSchedule = append(modelsReSchedule, model.Name) @@ -1013,7 +1014,7 @@ func (m *ModelServerStore) ServerNotify(request *pb.ServerNotify) error { logger.Debugf("ServerNotify %v", request) - server, err := m.store.servers.Get(request.Name) + server, err := m.store.servers.Get(context.TODO(), request.Name) if err != nil { if !errors.Is(err, ErrNotFound) { return fmt.Errorf("failed to find server %s: %w", request.Name, err) @@ -1023,7 +1024,7 @@ func (m *ModelServerStore) ServerNotify(request *pb.ServerNotify) error { server.MinReplicas = int32(request.MinReplicas) server.MaxReplicas = int32(request.MaxReplicas) server.KubernetesMeta = request.KubernetesMeta - if err := m.store.servers.Insert(server); err != nil { + if err := m.store.servers.Insert(context.TODO(), server); err != nil { return err } return nil @@ -1034,7 +1035,7 @@ func (m *ModelServerStore) ServerNotify(request *pb.ServerNotify) error { server.MaxReplicas = int32(request.MaxReplicas) server.KubernetesMeta = request.KubernetesMeta - if err := m.store.servers.Update(server); err != nil { + if err := m.store.servers.Update(context.TODO(), server); err != nil { return fmt.Errorf("failed to update server %s: %w", request.Name, err) } return nil @@ -1042,7 +1043,7 @@ func (m *ModelServerStore) ServerNotify(request *pb.ServerNotify) error { func (m *ModelServerStore) numEmptyServerReplicas(serverName string) (uint32, error) { emptyReplicas := uint32(0) - server, err := m.store.servers.Get(serverName) + server, err := m.store.servers.Get(context.TODO(), serverName) if err != nil { if !errors.Is(err, ErrNotFound) { return 0, err @@ -1058,7 +1059,7 @@ func (m *ModelServerStore) numEmptyServerReplicas(serverName string) (uint32, er } func (m *ModelServerStore) maxNumModelReplicasForServer(serverName string) (uint32, error) { - models, err := m.store.models.List() + models, err := m.store.models.List(context.TODO()) if err != nil { return 0, err } @@ -1107,7 +1108,7 @@ func (m *ModelServerStore) setModelGwModelStateImpl(name string, versionNumber u defer m.mu.Unlock() var evts []*coordinator.ModelEventMsg - model, err := m.store.models.Get(name) + model, err := m.store.models.Get(context.TODO(), name) if err != nil { return nil, fmt.Errorf("failed to find model %s", name) } @@ -1119,7 +1120,7 @@ func (m *ModelServerStore) setModelGwModelStateImpl(name string, versionNumber u if modelVersion.State.ModelGwState != status || modelVersion.State.ModelGwReason != reason { modelVersion.State.ModelGwState = status modelVersion.State.ModelGwReason = reason - if err := m.store.models.Update(model); err != nil { + if err := m.store.models.Update(context.TODO(), model); err != nil { return nil, fmt.Errorf("failed to update model %s: %w", name, err) } evt := &coordinator.ModelEventMsg{ @@ -1133,11 +1134,11 @@ func (m *ModelServerStore) setModelGwModelStateImpl(name string, versionNumber u } func (m *ModelServerStore) EmitEvents() error { - servers, err := m.store.servers.List() + servers, err := m.store.servers.List(context.TODO()) if err != nil { return err } - models, err := m.store.models.List() + models, err := m.store.models.List(context.TODO()) if err != nil { return err } diff --git a/scheduler/pkg/store/memory_status.go b/scheduler/pkg/store/memory_status.go index 1fa61c0472..090708e171 100644 --- a/scheduler/pkg/store/memory_status.go +++ b/scheduler/pkg/store/memory_status.go @@ -10,6 +10,7 @@ the Change License after the Change Date as each is defined in accordance with t package store import ( + "context" "fmt" "time" @@ -125,7 +126,7 @@ func (m *ModelServerStore) FailedScheduling(modelID string, version uint32, reas m.mu.Lock() defer m.mu.Unlock() - model, err := m.store.models.Get(modelID) + model, err := m.store.models.Get(context.TODO(), modelID) if err != nil { return fmt.Errorf("model %s not found: %w", modelID, err) } @@ -151,7 +152,7 @@ func (m *ModelServerStore) FailedScheduling(modelID string, version uint32, reas modelVersion.Server = "" } - if err := m.store.models.Update(model); err != nil { + if err := m.store.models.Update(context.TODO(), model); err != nil { return fmt.Errorf("failed to update model %s: %w", modelID, err) } @@ -176,7 +177,7 @@ func (m *ModelServerStore) updateModelStatus(isLatest bool, deleted bool, modelV logger.Debugf("Stats %+v modelVersion %+v prev model %+v", stats, modelVersion, prevModelVersion) updateModelState(isLatest, modelVersion, prevModelVersion, stats, deleted) - if err := m.store.models.Update(model); err != nil { + if err := m.store.models.Update(context.TODO(), model); err != nil { return fmt.Errorf("failed to update model: %w", err) } return nil @@ -196,7 +197,7 @@ func (m *ModelServerStore) UnloadModelGwVersionModels(modelKey string, version u m.mu.Lock() defer m.mu.Unlock() - model, err := m.store.models.Get(modelKey) + model, err := m.store.models.Get(context.TODO(), modelKey) if err != nil { return false, fmt.Errorf("failed to find model %s: %w", modelKey, err) } @@ -208,7 +209,7 @@ func (m *ModelServerStore) UnloadModelGwVersionModels(modelKey string, version u m.setModelGwStatusToTerminate(false, modelVersion) - if err := m.store.models.Update(model); err != nil { + if err := m.store.models.Update(context.TODO(), model); err != nil { return false, fmt.Errorf("failed to update model %s: %w", modelKey, err) } diff --git a/scheduler/pkg/store/storage.go b/scheduler/pkg/store/storage.go index ba1fb3d522..1954da8d5b 100644 --- a/scheduler/pkg/store/storage.go +++ b/scheduler/pkg/store/storage.go @@ -10,7 +10,9 @@ the Change License after the Change Date as each is defined in accordance with t package store import ( + "context" "errors" + "google.golang.org/protobuf/proto" ) @@ -19,9 +21,9 @@ var ( ) type Storage[T proto.Message] interface { - Get(name string) (T, error) - Insert(record T) error - List() ([]T, error) - Update(record T) error - Delete(name string) error + Get(ctx context.Context, name string) (T, error) + Insert(ctx context.Context, record T) error + List(ctx context.Context) ([]T, error) + Update(ctx context.Context, record T) error + Delete(ctx context.Context, name string) error } diff --git a/scheduler/pkg/store/test_memory_hack.go b/scheduler/pkg/store/test_memory_hack.go index 4256a91011..403b245721 100644 --- a/scheduler/pkg/store/test_memory_hack.go +++ b/scheduler/pkg/store/test_memory_hack.go @@ -10,6 +10,7 @@ the Change License after the Change Date as each is defined in accordance with t package store import ( + "context" "errors" "fmt" "testing" @@ -50,7 +51,7 @@ func (t *TestMemoryStore) DirectlyUpdateModelStatus(model ModelID, state *db.Mod t.mu.Lock() defer t.mu.Unlock() - found, err := t.store.models.Get(model.Name) + found, err := t.store.models.Get(context.TODO(), model.Name) if err != nil { return fmt.Errorf("model not found: %w", err) } @@ -61,5 +62,5 @@ func (t *TestMemoryStore) DirectlyUpdateModelStatus(model ModelID, state *db.Mod } version.State = state - return t.store.models.Update(found) + return t.store.models.Update(context.TODO(), found) } From 64b9cf07a9099ea4d9da35ae1f7f865928fc2225 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 5 Jan 2026 19:55:52 +0000 Subject: [PATCH 13/50] fix states --- apis/go/mlops/scheduler/db/model_ext.go | 2 +- scheduler/pkg/store/memory.go | 7 +++++-- scheduler/pkg/store/memory_status.go | 2 +- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/apis/go/mlops/scheduler/db/model_ext.go b/apis/go/mlops/scheduler/db/model_ext.go index 6e032d7f06..4d1eaa1887 100644 --- a/apis/go/mlops/scheduler/db/model_ext.go +++ b/apis/go/mlops/scheduler/db/model_ext.go @@ -297,7 +297,7 @@ func (m ModelReplicaState) AlreadyLoadingOrLoaded() bool { func (m ModelReplicaState) UnloadingOrUnloaded() bool { return m == ModelReplicaState_UnloadEnvoyRequested || m == ModelReplicaState_UnloadRequested || - m == ModelReplicaState_Loading || + m == ModelReplicaState_Unloading || m == ModelReplicaState_Unloaded || m == ModelReplicaState_ModelReplicaStateUnknown } diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index b83f603965..7ea8642d1e 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -423,7 +423,7 @@ func (m *ModelServerStore) updateLoadedModelsImpl( "Model %s version %d on server %s replica %d does not exist yet and should be loaded", modelKey, modelVersion.Version, serverKey, replicaIdx, ) - modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_UnloadRequested, "") + modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_LoadRequested, "") if err := m.updateReservedMemory(db.ModelReplicaState_LoadRequested, serverKey, int(replicaIdx), modelVersion.GetRequiredMemory()); err != nil { return nil, fmt.Errorf("failed to update server %s replica %d: %w", serverKey, replicaIdx, err) @@ -459,6 +459,10 @@ func (m *ModelServerStore) updateLoadedModelsImpl( } } + if err := m.store.models.Update(context.TODO(), model); err != nil { + return nil, fmt.Errorf("failed to update model %s: %w", modelKey, err) + } + // in cases where we did have a previous ScheduleFailed, we need to reflect the change here // this could be in the cases where we are scaling down a model and the new replica count can be all deployed // and always send an update for deleted models, so the operator will remove them from k8s @@ -804,7 +808,6 @@ func (m *ModelServerStore) addServerReplicaImpl(request *agent.AgentSubscribeReq request.ReplicaConfig, request.AvailableMemoryBytes, ) - server.AddReplica(int32(request.ReplicaIdx), serverReplica) if err := m.store.servers.Update(context.TODO(), server); err != nil { diff --git a/scheduler/pkg/store/memory_status.go b/scheduler/pkg/store/memory_status.go index 090708e171..cdfc6b4d09 100644 --- a/scheduler/pkg/store/memory_status.go +++ b/scheduler/pkg/store/memory_status.go @@ -82,7 +82,7 @@ func updateModelState(isLatest bool, modelVersion *db.ModelVersion, prevModelVer modelTimestamp := stats.latestTime if deleted || !isLatest { if stats.replicasUnloadFailed > 0 { - modelState = db.ModelState_ModelTerminated + modelState = db.ModelState_ModelTerminateFailed modelReason = stats.lastFailedReason modelTimestamp = stats.lastFailedStateTime } else if stats.replicasUnloading > 0 || stats.replicasAvailable > 0 || stats.replicasLoading > 0 { From efa9a254d303aa59f8feb123528d19d13762cdb7 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Tue, 6 Jan 2026 10:20:15 +0000 Subject: [PATCH 14/50] wip: fix auto-scaling where expected replicas being cast to -1 during uint32 to int32 conversion --- apis/go/mlops/scheduler/db/db.pb.go | 18 ++++++------- apis/mlops/scheduler/db/db.proto | 6 ++--- .../controllers/mlops/server_controller.go | 2 +- operator/scheduler/server.go | 7 ++++- scheduler/pkg/scheduler/scheduler.go | 2 +- scheduler/pkg/server/server_status.go | 26 ++++++++++++++++--- scheduler/pkg/store/memory.go | 12 ++++----- 7 files changed, 48 insertions(+), 25 deletions(-) diff --git a/apis/go/mlops/scheduler/db/db.pb.go b/apis/go/mlops/scheduler/db/db.pb.go index ff34ef0d58..76a4206da0 100644 --- a/apis/go/mlops/scheduler/db/db.pb.go +++ b/apis/go/mlops/scheduler/db/db.pb.go @@ -691,9 +691,9 @@ type Server struct { Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` Replicas map[int32]*ServerReplica `protobuf:"bytes,2,rep,name=replicas,proto3" json:"replicas,omitempty" protobuf_key:"varint,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` Shared bool `protobuf:"varint,3,opt,name=shared,proto3" json:"shared,omitempty"` - ExpectedReplicas int32 `protobuf:"varint,4,opt,name=expected_replicas,json=expectedReplicas,proto3" json:"expected_replicas,omitempty"` - MinReplicas int32 `protobuf:"varint,5,opt,name=min_replicas,json=minReplicas,proto3" json:"min_replicas,omitempty"` - MaxReplicas int32 `protobuf:"varint,6,opt,name=max_replicas,json=maxReplicas,proto3" json:"max_replicas,omitempty"` + ExpectedReplicas uint32 `protobuf:"varint,4,opt,name=expected_replicas,json=expectedReplicas,proto3" json:"expected_replicas,omitempty"` + MinReplicas uint32 `protobuf:"varint,5,opt,name=min_replicas,json=minReplicas,proto3" json:"min_replicas,omitempty"` + MaxReplicas uint32 `protobuf:"varint,6,opt,name=max_replicas,json=maxReplicas,proto3" json:"max_replicas,omitempty"` KubernetesMeta *scheduler.KubernetesMeta `protobuf:"bytes,7,opt,name=kubernetes_meta,json=kubernetesMeta,proto3" json:"kubernetes_meta,omitempty"` } @@ -750,21 +750,21 @@ func (x *Server) GetShared() bool { return false } -func (x *Server) GetExpectedReplicas() int32 { +func (x *Server) GetExpectedReplicas() uint32 { if x != nil { return x.ExpectedReplicas } return 0 } -func (x *Server) GetMinReplicas() int32 { +func (x *Server) GetMinReplicas() uint32 { if x != nil { return x.MinReplicas } return 0 } -func (x *Server) GetMaxReplicas() int32 { +func (x *Server) GetMaxReplicas() uint32 { if x != nil { return x.MaxReplicas } @@ -1040,12 +1040,12 @@ var file_mlops_scheduler_db_db_proto_rawDesc = []byte{ 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x12, 0x2b, 0x0a, 0x11, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, - 0x63, 0x61, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x05, 0x52, 0x10, 0x65, 0x78, 0x70, 0x65, 0x63, + 0x63, 0x61, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x10, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x21, 0x0a, 0x0c, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, - 0x05, 0x52, 0x0b, 0x6d, 0x69, 0x6e, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x21, + 0x0d, 0x52, 0x0b, 0x6d, 0x69, 0x6e, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x21, 0x0a, 0x0c, 0x6d, 0x61, 0x78, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x18, 0x06, - 0x20, 0x01, 0x28, 0x05, 0x52, 0x0b, 0x6d, 0x61, 0x78, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, + 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0b, 0x6d, 0x61, 0x78, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x4f, 0x0a, 0x0f, 0x6b, 0x75, 0x62, 0x65, 0x72, 0x6e, 0x65, 0x74, 0x65, 0x73, 0x5f, 0x6d, 0x65, 0x74, 0x61, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x26, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, diff --git a/apis/mlops/scheduler/db/db.proto b/apis/mlops/scheduler/db/db.proto index 90f6594417..36d68ead6f 100644 --- a/apis/mlops/scheduler/db/db.proto +++ b/apis/mlops/scheduler/db/db.proto @@ -108,9 +108,9 @@ message Server { string name = 1; map replicas = 2; bool shared = 3; - int32 expected_replicas = 4; - int32 min_replicas = 5; - int32 max_replicas = 6; + uint32 expected_replicas = 4; + uint32 min_replicas = 5; + uint32 max_replicas = 6; seldon.mlops.scheduler.KubernetesMeta kubernetes_meta = 7; } diff --git a/operator/controllers/mlops/server_controller.go b/operator/controllers/mlops/server_controller.go index a130b4f180..4bc4204769 100644 --- a/operator/controllers/mlops/server_controller.go +++ b/operator/controllers/mlops/server_controller.go @@ -203,7 +203,7 @@ func (r *ServerReconciler) updateStatus(ctx context.Context, server *mlopsv1alph } else { if err := r.Status().Update(ctx, server); err != nil { r.Recorder.Eventf(server, v1.EventTypeWarning, "UpdateFailed", - "Failed to update status for Model %q: %v", server.Name, err) + "Failed to update status for Server %q: %v", server.Name, err) return err } else { prevWasReady := serverReady(existingServer.Status) diff --git a/operator/scheduler/server.go b/operator/scheduler/server.go index c243fc55a4..eec1a942fb 100644 --- a/operator/scheduler/server.go +++ b/operator/scheduler/server.go @@ -140,7 +140,7 @@ func (s *SchedulerClient) SubscribeServerEvents(ctx context.Context, grpcClient return err } - logger.Info("Received event", "server", event.ServerName) + logger.Info("Received event", "server", event.ServerName, "event", event) if event.GetKubernetesMeta() == nil { logger.Info("Received server event with no k8s metadata so ignoring", "server", event.ServerName) @@ -173,6 +173,7 @@ func (s *SchedulerClient) SubscribeServerEvents(ctx context.Context, grpcClient server, ); err != nil { if errors.IsNotFound(err) { + logger.Info("Server does not exist", "server", event.ServerName, "event", event) return nil } return err @@ -190,11 +191,14 @@ func (s *SchedulerClient) SubscribeServerEvents(ctx context.Context, grpcClient // At the moment, the scheduler doesn't send multiple types of updates in a single event; switch event.GetType() { case scheduler.ServerStatusResponse_StatusUpdate: + logger.Info("Received status update event", "server", event.ServerName, "event", event) server.Status.LoadedModelReplicas = event.NumLoadedModelReplicas return s.updateServerStatus(contextWithTimeout, server) case scheduler.ServerStatusResponse_ScalingRequest: + logger.Info("Received scaling request", "server", event.ServerName, "event", event) return s.scaleServerReplicas(contextWithTimeout, server, event) default: // we ignore unknown event types + logger.Info("Received unknown event, ignoring", "server", event.ServerName, "event", event) return nil } }) @@ -229,6 +233,7 @@ func (s *SchedulerClient) scaleServerReplicas(ctx context.Context, server *v1alp if err := s.Patch(ctx, newServer, client.MergeFrom(server)); err != nil { if errors.IsNotFound(err) { + s.logger.Error(err, "Scaling server failed, server not found", "server", server.Name, "event", event) return nil } s.recorder.Eventf(server, v1.EventTypeWarning, "PatchFailed", diff --git a/scheduler/pkg/scheduler/scheduler.go b/scheduler/pkg/scheduler/scheduler.go index 23d31a6008..3a65180af3 100644 --- a/scheduler/pkg/scheduler/scheduler.go +++ b/scheduler/pkg/scheduler/scheduler.go @@ -255,7 +255,7 @@ func (s *SimpleScheduler) scheduleToServer(modelName string) (*coordinator.Serve if err := s.store.FailedScheduling(latestModel.ModelName(), latestModel.GetVersion(), msg, !latestModel.HasLiveReplicas() && !latestModel.IsLoadingOrLoadedOnServer()); err != nil { - return nil, fmt.Errorf("%s: got error making model as failed in memory store: %w", msg, err) + return nil, fmt.Errorf("%s: error marking model as failed in memory store: %w", msg, err) } return serverEvent, errors.New(msg) } diff --git a/scheduler/pkg/server/server_status.go b/scheduler/pkg/server/server_status.go index eaa3a2acdb..c22098f02a 100644 --- a/scheduler/pkg/server/server_status.go +++ b/scheduler/pkg/server/server_status.go @@ -707,8 +707,12 @@ func (s *SchedulerServer) handleModelEventForServerStatus(event coordinator.Mode } func (s *SchedulerServer) handleServerEvents(event coordinator.ServerEventMsg) { - logger := s.logger.WithField("func", "handleServerEvents") - logger.Debugf("Got server state %s change for %s", event.ServerName, event.String()) + logger := s.logger.WithFields(logrus.Fields{ + "server": event.ServerName, + "event": event, + "func": "handleServerEvents", + }) + logger.Info("Got server event") server, stats, err := s.modelStore.GetServer(event.ServerName, true, true) if err != nil { @@ -716,18 +720,32 @@ func (s *SchedulerServer) handleServerEvents(event coordinator.ServerEventMsg) { return } + logger.Debugf("Retrieved from store: server %s stats %+v server %+v", event.ServerName, stats, server) + if s.config.AutoScalingServerEnabled { - if event.UpdateContext == coordinator.SERVER_SCALE_DOWN { + switch event.UpdateContext { + case coordinator.SERVER_SCALE_DOWN: if ok, replicas := shouldScaleDown(server, stats, float32(s.config.PackThreshold)); ok { logger.Infof("Server %s is scaling down to %d", event.ServerName, replicas) s.sendServerScale(server, replicas) + return } - } else if event.UpdateContext == coordinator.SERVER_SCALE_UP { + logger.Info("Scale-down requested but not allowed") + case coordinator.SERVER_SCALE_UP: if ok, replicas := shouldScaleUp(server, stats); ok { logger.Infof("Server %s is scaling up to %d", event.ServerName, replicas) s.sendServerScale(server, replicas) + return } + logger.Info("Scale-up requested but not allowed") + default: + logger.Warnf("Server event context %d not recognised", event.UpdateContext) } + return + } + + if event.UpdateContext == coordinator.SERVER_SCALE_UP || event.UpdateContext == coordinator.SERVER_SCALE_DOWN { + logger.Info("Ignoring scale up/down request for server as auto-scaling disabled") } } diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index 7ea8642d1e..ec13df91df 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -1023,9 +1023,9 @@ func (m *ModelServerStore) ServerNotify(request *pb.ServerNotify) error { return fmt.Errorf("failed to find server %s: %w", request.Name, err) } server = NewServer(request.Name, request.Shared) - server.ExpectedReplicas = int32(request.ExpectedReplicas) - server.MinReplicas = int32(request.MinReplicas) - server.MaxReplicas = int32(request.MaxReplicas) + server.ExpectedReplicas = request.ExpectedReplicas + server.MinReplicas = request.MinReplicas + server.MaxReplicas = request.MaxReplicas server.KubernetesMeta = request.KubernetesMeta if err := m.store.servers.Insert(context.TODO(), server); err != nil { return err @@ -1033,9 +1033,9 @@ func (m *ModelServerStore) ServerNotify(request *pb.ServerNotify) error { return nil } - server.ExpectedReplicas = int32(request.ExpectedReplicas) - server.MinReplicas = int32(request.MinReplicas) - server.MaxReplicas = int32(request.MaxReplicas) + server.ExpectedReplicas = request.ExpectedReplicas + server.MinReplicas = request.MinReplicas + server.MaxReplicas = request.MaxReplicas server.KubernetesMeta = request.KubernetesMeta if err := m.store.servers.Update(context.TODO(), server); err != nil { From d80c203f9b8b37377c62f954ce899be378fffb19 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Tue, 6 Jan 2026 11:45:15 +0000 Subject: [PATCH 15/50] fix auto scaling --- apis/go/mlops/scheduler/db/db.pb.go | 18 +++++++++--------- apis/mlops/scheduler/db/db.proto | 6 +++--- scheduler/pkg/store/memory.go | 18 +++++++++--------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/apis/go/mlops/scheduler/db/db.pb.go b/apis/go/mlops/scheduler/db/db.pb.go index 76a4206da0..fd4ec606cd 100644 --- a/apis/go/mlops/scheduler/db/db.pb.go +++ b/apis/go/mlops/scheduler/db/db.pb.go @@ -691,9 +691,9 @@ type Server struct { Name string `protobuf:"bytes,1,opt,name=name,proto3" json:"name,omitempty"` Replicas map[int32]*ServerReplica `protobuf:"bytes,2,rep,name=replicas,proto3" json:"replicas,omitempty" protobuf_key:"varint,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` Shared bool `protobuf:"varint,3,opt,name=shared,proto3" json:"shared,omitempty"` - ExpectedReplicas uint32 `protobuf:"varint,4,opt,name=expected_replicas,json=expectedReplicas,proto3" json:"expected_replicas,omitempty"` - MinReplicas uint32 `protobuf:"varint,5,opt,name=min_replicas,json=minReplicas,proto3" json:"min_replicas,omitempty"` - MaxReplicas uint32 `protobuf:"varint,6,opt,name=max_replicas,json=maxReplicas,proto3" json:"max_replicas,omitempty"` + ExpectedReplicas int64 `protobuf:"varint,4,opt,name=expected_replicas,json=expectedReplicas,proto3" json:"expected_replicas,omitempty"` + MinReplicas int64 `protobuf:"varint,5,opt,name=min_replicas,json=minReplicas,proto3" json:"min_replicas,omitempty"` + MaxReplicas int64 `protobuf:"varint,6,opt,name=max_replicas,json=maxReplicas,proto3" json:"max_replicas,omitempty"` KubernetesMeta *scheduler.KubernetesMeta `protobuf:"bytes,7,opt,name=kubernetes_meta,json=kubernetesMeta,proto3" json:"kubernetes_meta,omitempty"` } @@ -750,21 +750,21 @@ func (x *Server) GetShared() bool { return false } -func (x *Server) GetExpectedReplicas() uint32 { +func (x *Server) GetExpectedReplicas() int64 { if x != nil { return x.ExpectedReplicas } return 0 } -func (x *Server) GetMinReplicas() uint32 { +func (x *Server) GetMinReplicas() int64 { if x != nil { return x.MinReplicas } return 0 } -func (x *Server) GetMaxReplicas() uint32 { +func (x *Server) GetMaxReplicas() int64 { if x != nil { return x.MaxReplicas } @@ -1040,12 +1040,12 @@ var file_mlops_scheduler_db_db_proto_rawDesc = []byte{ 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x16, 0x0a, 0x06, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x18, 0x03, 0x20, 0x01, 0x28, 0x08, 0x52, 0x06, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x12, 0x2b, 0x0a, 0x11, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, - 0x63, 0x61, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x0d, 0x52, 0x10, 0x65, 0x78, 0x70, 0x65, 0x63, + 0x63, 0x61, 0x73, 0x18, 0x04, 0x20, 0x01, 0x28, 0x03, 0x52, 0x10, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x65, 0x64, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x21, 0x0a, 0x0c, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x18, 0x05, 0x20, 0x01, 0x28, - 0x0d, 0x52, 0x0b, 0x6d, 0x69, 0x6e, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x21, + 0x03, 0x52, 0x0b, 0x6d, 0x69, 0x6e, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x21, 0x0a, 0x0c, 0x6d, 0x61, 0x78, 0x5f, 0x72, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x18, 0x06, - 0x20, 0x01, 0x28, 0x0d, 0x52, 0x0b, 0x6d, 0x61, 0x78, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, + 0x20, 0x01, 0x28, 0x03, 0x52, 0x0b, 0x6d, 0x61, 0x78, 0x52, 0x65, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x73, 0x12, 0x4f, 0x0a, 0x0f, 0x6b, 0x75, 0x62, 0x65, 0x72, 0x6e, 0x65, 0x74, 0x65, 0x73, 0x5f, 0x6d, 0x65, 0x74, 0x61, 0x18, 0x07, 0x20, 0x01, 0x28, 0x0b, 0x32, 0x26, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, diff --git a/apis/mlops/scheduler/db/db.proto b/apis/mlops/scheduler/db/db.proto index 36d68ead6f..9bee8037ff 100644 --- a/apis/mlops/scheduler/db/db.proto +++ b/apis/mlops/scheduler/db/db.proto @@ -108,9 +108,9 @@ message Server { string name = 1; map replicas = 2; bool shared = 3; - uint32 expected_replicas = 4; - uint32 min_replicas = 5; - uint32 max_replicas = 6; + int64 expected_replicas = 4; + int64 min_replicas = 5; + int64 max_replicas = 6; seldon.mlops.scheduler.KubernetesMeta kubernetes_meta = 7; } diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index ec13df91df..9b4f5725de 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -1017,27 +1017,27 @@ func (m *ModelServerStore) ServerNotify(request *pb.ServerNotify) error { logger.Debugf("ServerNotify %v", request) + set := func(s *db.Server) { + s.ExpectedReplicas = int64(request.ExpectedReplicas) + s.MinReplicas = int64(request.MinReplicas) + s.MaxReplicas = int64(request.MaxReplicas) + s.KubernetesMeta = request.KubernetesMeta + } + server, err := m.store.servers.Get(context.TODO(), request.Name) if err != nil { if !errors.Is(err, ErrNotFound) { return fmt.Errorf("failed to find server %s: %w", request.Name, err) } server = NewServer(request.Name, request.Shared) - server.ExpectedReplicas = request.ExpectedReplicas - server.MinReplicas = request.MinReplicas - server.MaxReplicas = request.MaxReplicas - server.KubernetesMeta = request.KubernetesMeta + set(server) if err := m.store.servers.Insert(context.TODO(), server); err != nil { return err } return nil } - server.ExpectedReplicas = request.ExpectedReplicas - server.MinReplicas = request.MinReplicas - server.MaxReplicas = request.MaxReplicas - server.KubernetesMeta = request.KubernetesMeta - + set(server) if err := m.store.servers.Update(context.TODO(), server); err != nil { return fmt.Errorf("failed to update server %s: %w", request.Name, err) } From a57aac4876ec0b06e9122bc2bf8f360bd5443bbe Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Tue, 6 Jan 2026 15:22:55 +0000 Subject: [PATCH 16/50] server locks to prevent overwriting with stale data --- scheduler/pkg/agent/agent_svc_manager.go | 2 +- scheduler/pkg/agent/server.go | 10 +++--- scheduler/pkg/envoy/processor/incremental.go | 7 ++++- .../pkg/scheduler/filters/replicamemory.go | 3 +- scheduler/pkg/server/server.go | 3 ++ scheduler/pkg/store/api.go | 2 ++ scheduler/pkg/store/memory.go | 31 +++++++++++++++++-- 7 files changed, 49 insertions(+), 9 deletions(-) diff --git a/scheduler/pkg/agent/agent_svc_manager.go b/scheduler/pkg/agent/agent_svc_manager.go index 44afc79332..4909d5976f 100644 --- a/scheduler/pkg/agent/agent_svc_manager.go +++ b/scheduler/pkg/agent/agent_svc_manager.go @@ -571,7 +571,7 @@ func (am *AgentServiceManager) handleSchedulerSubscription() error { if err != nil { return fmt.Errorf("failed waiting to check if pod's IP is published to endpoints: %v", err) } - logger.Debug("Found IP in endpoints") + logger.Info("Found IP in endpoints - waiting for scheduler instructions") } // Start the main control loop for the agent<-scheduler stream diff --git a/scheduler/pkg/agent/server.go b/scheduler/pkg/agent/server.go index 332b390f03..7f2bf41c9a 100644 --- a/scheduler/pkg/agent/server.go +++ b/scheduler/pkg/agent/server.go @@ -283,7 +283,7 @@ func (s *Server) Sync(modelName string) { // Handle any load requests for latest version - we don't want to load models from older versions if latestModel != nil { for _, replicaIdx := range latestModel.GetReplicaForState(db.ModelReplicaState_LoadRequested) { - logger.Infof("Sending load model request for %s", modelName) + logger.Infof("Sending agent load model request for %s", modelName) as, ok := s.agents[ServerKey{serverName: latestModel.Server, replicaIdx: uint32(replicaIdx)}] @@ -321,7 +321,7 @@ func (s *Server) Sync(modelName string) { // Loop through all versions and unload any requested - any version of a model might have an unload request for _, modelVersion := range model.Versions { for _, replicaIdx := range modelVersion.GetReplicaForState(db.ModelReplicaState_UnloadRequested) { - s.logger.Infof("Sending unload model request for %s:%d", modelName, modelVersion.GetVersion()) + s.logger.Infof("Sending agent unload model request for %s:%d", modelName, modelVersion.GetVersion()) as, ok := s.agents[ServerKey{serverName: modelVersion.Server, replicaIdx: uint32(replicaIdx)}] if !ok { logger.Errorf("Failed to find server replica for %s:%d", modelVersion.Server, replicaIdx) @@ -360,6 +360,9 @@ func (s *Server) AgentDrain(ctx context.Context, message *pb.AgentDrainRequest) } func (s *Server) AgentEvent(ctx context.Context, message *pb.ModelEventMessage) (*pb.ModelEventResponse, error) { + s.store.LockModel(message.ModelName) + defer s.store.UnlockModel(message.ModelName) + logger := s.logger.WithField("func", "AgentEvent") var desiredState db.ModelReplicaState var expectedState db.ModelReplicaState @@ -380,9 +383,8 @@ func (s *Server) AgentEvent(ctx context.Context, message *pb.ModelEventMessage) default: desiredState = db.ModelReplicaState_ModelReplicaStateUnknown } + logger.Infof("Updating state for model %s to %s", message.ModelName, desiredState.String()) - s.store.LockModel(message.ModelName) - defer s.store.UnlockModel(message.ModelName) err := s.store.UpdateModelState(message.ModelName, message.GetModelVersion(), message.ServerName, int(message.ReplicaIdx), &message.AvailableMemoryBytes, expectedState, desiredState, message.GetMessage(), message.GetRuntimeInfo()) if err != nil { logger.WithError(err).Infof("Failed Updating state for model %s", message.ModelName) diff --git a/scheduler/pkg/envoy/processor/incremental.go b/scheduler/pkg/envoy/processor/incremental.go index 7130506e79..c7515284b7 100644 --- a/scheduler/pkg/envoy/processor/incremental.go +++ b/scheduler/pkg/envoy/processor/incremental.go @@ -260,6 +260,8 @@ func (p *IncrementalProcessor) addModelTraffic(routeName string, model *db.Model if err != nil { return err } + p.modelStore.LockServer(server.Name) + defer p.modelStore.UnlockServer(server.Name) lastAvailableModelVersion := model.GetLastAvailableModel() if lastAvailableModelVersion != nil && latestModel.GetVersion() != lastAvailableModelVersion.GetVersion() { @@ -677,6 +679,7 @@ func (p *IncrementalProcessor) modelSync() { p.modelStore.UnlockModel(mv.name) continue } + p.modelStore.LockServer(s.Name) vs := v.ReplicaState() // Go through all replicas that can receive traffic @@ -716,7 +719,7 @@ func (p *IncrementalProcessor) modelSync() { replicaIdx, nil, serverReplicaExpectedState, - db.ModelReplicaState_UnloadEnvoyRequested, + db.ModelReplicaState_UnloadRequested, "", nil, ); err != nil { @@ -724,6 +727,8 @@ func (p *IncrementalProcessor) modelSync() { mv.name, store.UnloadRequested.String(), serverReplicaExpectedState.String()) } } + + p.modelStore.UnlockServer(s.Name) p.modelStore.UnlockModel(mv.name) p.callVersionCleanupIfNeeded(m.Name) } diff --git a/scheduler/pkg/scheduler/filters/replicamemory.go b/scheduler/pkg/scheduler/filters/replicamemory.go index d7e793dd17..cd5334a12b 100644 --- a/scheduler/pkg/scheduler/filters/replicamemory.go +++ b/scheduler/pkg/scheduler/filters/replicamemory.go @@ -35,5 +35,6 @@ func (r AvailableMemoryReplicaFilter) Filter(model *db.ModelVersion, replica *db } func (r AvailableMemoryReplicaFilter) Description(model *db.ModelVersion, replica *db.ServerReplica) string { - return fmt.Sprintf("model memory %d replica memory %d", model.GetRequiredMemory(), replica.GetAvailableMemory()) + return fmt.Sprintf("model memory %d replica memory %d replica reserved memory %d", + model.GetRequiredMemory(), replica.GetAvailableMemory(), replica.GetReservedMemory()) } diff --git a/scheduler/pkg/server/server.go b/scheduler/pkg/server/server.go index 3c81f7b3e7..c6ca5d87be 100644 --- a/scheduler/pkg/server/server.go +++ b/scheduler/pkg/server/server.go @@ -474,6 +474,9 @@ func (s *SchedulerServer) rescheduleModels(serverKey string) { logger.WithError(err).Errorf("Failed to get server %s", serverKey) return } + s.modelStore.LockServer(server.Name) + defer s.modelStore.UnlockServer(server.Name) + models := make(map[string]bool) for _, replica := range server.Replicas { for _, model := range replica.GetLoadedOrLoadingModelVersions() { diff --git a/scheduler/pkg/store/api.go b/scheduler/pkg/store/api.go index 12a1c2b3bf..a621d3e4ce 100644 --- a/scheduler/pkg/store/api.go +++ b/scheduler/pkg/store/api.go @@ -143,6 +143,8 @@ type ModelServerAPI interface { GetModels() ([]*db.Model, error) LockModel(modelId string) UnlockModel(modelId string) + LockServer(serverID string) + UnlockServer(serverID string) RemoveModel(req *pb.UnloadModelRequest) error GetServers(shallow bool, modelDetails bool) ([]*db.Server, error) GetServer(serverKey string, shallow bool, modelDetails bool) (*db.Server, *ServerStats, error) diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index 9b4f5725de..a83b17fc89 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -207,15 +207,39 @@ func (m *ModelServerStore) deepCopy(model *Model, key string) *ModelSnapshot { return snapshot } +func (m *ModelServerStore) LockServer(serverId string) { + var lock sync.RWMutex + existingLock, _ := m.opLocks.LoadOrStore(serverLockID(serverId), &lock) + existingLock.(*sync.RWMutex).Lock() +} + +func modelLockID(modelId string) string { + return fmt.Sprintf("model_%s", modelId) +} + +func serverLockID(serverID string) string { + return fmt.Sprintf("server_%s", serverID) +} + func (m *ModelServerStore) LockModel(modelId string) { var lock sync.RWMutex - existingLock, _ := m.opLocks.LoadOrStore(modelId, &lock) + existingLock, _ := m.opLocks.LoadOrStore(modelLockID(modelId), &lock) existingLock.(*sync.RWMutex).Lock() } +func (m *ModelServerStore) UnlockServer(serverId string) { + logger := m.logger.WithField("func", "UnlockServer") + lock, loaded := m.opLocks.Load(serverLockID(serverId)) + if loaded { + lock.(*sync.RWMutex).Unlock() + } else { + logger.Warnf("Trying to unlock server %s that was not locked.", serverId) + } +} + func (m *ModelServerStore) UnlockModel(modelId string) { logger := m.logger.WithField("func", "UnlockModel") - lock, loaded := m.opLocks.Load(modelId) + lock, loaded := m.opLocks.Load(modelLockID(modelId)) if loaded { lock.(*sync.RWMutex).Unlock() } else { @@ -1137,6 +1161,9 @@ func (m *ModelServerStore) setModelGwModelStateImpl(name string, versionNumber u } func (m *ModelServerStore) EmitEvents() error { + m.mu.Lock() + defer m.mu.Unlock() + servers, err := m.store.servers.List(context.TODO()) if err != nil { return err From 2c9e98b82f7a5238e39c4d367e7b6f999bac6d51 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Tue, 6 Jan 2026 16:45:21 +0000 Subject: [PATCH 17/50] lock should be before getting server to avoid stale data --- scheduler/pkg/envoy/processor/incremental.go | 7 ++++--- scheduler/pkg/server/server.go | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/scheduler/pkg/envoy/processor/incremental.go b/scheduler/pkg/envoy/processor/incremental.go index c7515284b7..874b33686d 100644 --- a/scheduler/pkg/envoy/processor/incremental.go +++ b/scheduler/pkg/envoy/processor/incremental.go @@ -256,12 +256,12 @@ func (p *IncrementalProcessor) addModelTraffic(routeName string, model *db.Model return fmt.Errorf("no live replica for model %s for model route %s", model.Name, routeName) } + p.modelStore.LockServer(latestModel.Server) + defer p.modelStore.UnlockServer(latestModel.Server) server, _, err := p.modelStore.GetServer(latestModel.Server, false, false) if err != nil { return err } - p.modelStore.LockServer(server.Name) - defer p.modelStore.UnlockServer(server.Name) lastAvailableModelVersion := model.GetLastAvailableModel() if lastAvailableModelVersion != nil && latestModel.GetVersion() != lastAvailableModelVersion.GetVersion() { @@ -673,13 +673,14 @@ func (p *IncrementalProcessor) modelSync() { continue } + p.modelStore.LockServer(v.Server) s, _, err := p.modelStore.GetServer(v.Server, false, false) if err != nil { logger.Debugf("Failed to get server for model %s server %s", mv.name, v.Server) + p.modelStore.UnlockServer(v.Server) p.modelStore.UnlockModel(mv.name) continue } - p.modelStore.LockServer(s.Name) vs := v.ReplicaState() // Go through all replicas that can receive traffic diff --git a/scheduler/pkg/server/server.go b/scheduler/pkg/server/server.go index c6ca5d87be..dc45680ba6 100644 --- a/scheduler/pkg/server/server.go +++ b/scheduler/pkg/server/server.go @@ -469,14 +469,15 @@ func (s *SchedulerServer) HealthCheck(_ context.Context, _ *health.HealthCheckRe func (s *SchedulerServer) rescheduleModels(serverKey string) { logger := s.logger.WithField("func", "rescheduleModels") + + s.modelStore.LockServer(serverKey) + defer s.modelStore.UnlockServer(serverKey) + server, _, err := s.modelStore.GetServer(serverKey, false, true) if err != nil { logger.WithError(err).Errorf("Failed to get server %s", serverKey) return } - s.modelStore.LockServer(server.Name) - defer s.modelStore.UnlockServer(server.Name) - models := make(map[string]bool) for _, replica := range server.Replicas { for _, model := range replica.GetLoadedOrLoadingModelVersions() { From 551f88c4886c3b24bb90f30b3febabda14d562aa Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Wed, 7 Jan 2026 17:39:43 +0000 Subject: [PATCH 18/50] use SetReplicas which checks for nil map and will initialise if empty --- scheduler/pkg/store/memory.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index a83b17fc89..a30122bbe6 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -844,7 +844,7 @@ func (m *ModelServerStore) addServerReplicaImpl(request *agent.AgentSubscribeReq if err != nil { return nil, coordinator.ServerEventMsg{}, err } - modelVersion.Replicas[int32(request.ReplicaIdx)] = &db.ReplicaStatus{State: db.ModelReplicaState_Loaded} + modelVersion.SetReplicaState(int(request.ReplicaIdx), db.ModelReplicaState_Loaded, "") modelVersion.Server = request.ServerName if err := m.updateModelStatus(true, false, modelVersion, model.GetLastAvailableModelVersion(), model); err != nil { return nil, coordinator.ServerEventMsg{}, fmt.Errorf("failed to update model status: %w", err) From 76bedab40dcbf18cf6a5fb404945b6da0c804066 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Thu, 8 Jan 2026 09:29:30 +0000 Subject: [PATCH 19/50] fix adding a model which was previously deleted not loading onto replicas. Added proto for DB dump endpoint to aid debugging --- apis/Makefile | 1 + apis/go/mlops/scheduler/dbdump/db-dump.pb.go | 318 ++++++++++++++++++ .../mlops/scheduler/dbdump/db-dump_grpc.pb.go | 119 +++++++ apis/mlops/scheduler/db-dump.proto | 27 ++ scheduler/pkg/server/server.go | 1 + scheduler/pkg/store/in-memory/store.go | 3 +- scheduler/pkg/store/memory.go | 4 +- scheduler/pkg/store/storage.go | 3 +- 8 files changed, 471 insertions(+), 5 deletions(-) create mode 100644 apis/go/mlops/scheduler/dbdump/db-dump.pb.go create mode 100644 apis/go/mlops/scheduler/dbdump/db-dump_grpc.pb.go create mode 100644 apis/mlops/scheduler/db-dump.proto diff --git a/apis/Makefile b/apis/Makefile index 1a903a95df..6291b6cacb 100644 --- a/apis/Makefile +++ b/apis/Makefile @@ -163,6 +163,7 @@ build-go: \ ./mlops/proxy/proxy.proto \ ./mlops/scheduler/scheduler.proto \ ./mlops/scheduler/db/db.proto \ + ./mlops/scheduler/db-dump.proto \ ./mlops/chainer/chainer.proto \ ./mlops/v2_dataplane/v2_dataplane.proto \ ./mlops/health/health.proto \ diff --git a/apis/go/mlops/scheduler/dbdump/db-dump.pb.go b/apis/go/mlops/scheduler/dbdump/db-dump.pb.go new file mode 100644 index 0000000000..011040b0fa --- /dev/null +++ b/apis/go/mlops/scheduler/dbdump/db-dump.pb.go @@ -0,0 +1,318 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.34.2 +// protoc v5.27.2 +// source: mlops/scheduler/db-dump.proto + +package dbdump + +import ( + db "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type DatabaseDumpRequest_DatabaseDumpRecords int32 + +const ( + DatabaseDumpRequest_DUMP_UNKNOWN DatabaseDumpRequest_DatabaseDumpRecords = 0 + DatabaseDumpRequest_DUMP_ALL DatabaseDumpRequest_DatabaseDumpRecords = 1 + DatabaseDumpRequest_DUMP_MODELS DatabaseDumpRequest_DatabaseDumpRecords = 2 + DatabaseDumpRequest_DUMP_SERVERS DatabaseDumpRequest_DatabaseDumpRecords = 3 +) + +// Enum value maps for DatabaseDumpRequest_DatabaseDumpRecords. +var ( + DatabaseDumpRequest_DatabaseDumpRecords_name = map[int32]string{ + 0: "DUMP_UNKNOWN", + 1: "DUMP_ALL", + 2: "DUMP_MODELS", + 3: "DUMP_SERVERS", + } + DatabaseDumpRequest_DatabaseDumpRecords_value = map[string]int32{ + "DUMP_UNKNOWN": 0, + "DUMP_ALL": 1, + "DUMP_MODELS": 2, + "DUMP_SERVERS": 3, + } +) + +func (x DatabaseDumpRequest_DatabaseDumpRecords) Enum() *DatabaseDumpRequest_DatabaseDumpRecords { + p := new(DatabaseDumpRequest_DatabaseDumpRecords) + *p = x + return p +} + +func (x DatabaseDumpRequest_DatabaseDumpRecords) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (DatabaseDumpRequest_DatabaseDumpRecords) Descriptor() protoreflect.EnumDescriptor { + return file_mlops_scheduler_db_dump_proto_enumTypes[0].Descriptor() +} + +func (DatabaseDumpRequest_DatabaseDumpRecords) Type() protoreflect.EnumType { + return &file_mlops_scheduler_db_dump_proto_enumTypes[0] +} + +func (x DatabaseDumpRequest_DatabaseDumpRecords) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use DatabaseDumpRequest_DatabaseDumpRecords.Descriptor instead. +func (DatabaseDumpRequest_DatabaseDumpRecords) EnumDescriptor() ([]byte, []int) { + return file_mlops_scheduler_db_dump_proto_rawDescGZIP(), []int{0, 0} +} + +type DatabaseDumpRequest struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Action DatabaseDumpRequest_DatabaseDumpRecords `protobuf:"varint,1,opt,name=action,proto3,enum=seldon.mlops.scheduler.dbdump.DatabaseDumpRequest_DatabaseDumpRecords" json:"action,omitempty"` +} + +func (x *DatabaseDumpRequest) Reset() { + *x = DatabaseDumpRequest{} + if protoimpl.UnsafeEnabled { + mi := &file_mlops_scheduler_db_dump_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *DatabaseDumpRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*DatabaseDumpRequest) ProtoMessage() {} + +func (x *DatabaseDumpRequest) ProtoReflect() protoreflect.Message { + mi := &file_mlops_scheduler_db_dump_proto_msgTypes[0] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use DatabaseDumpRequest.ProtoReflect.Descriptor instead. +func (*DatabaseDumpRequest) Descriptor() ([]byte, []int) { + return file_mlops_scheduler_db_dump_proto_rawDescGZIP(), []int{0} +} + +func (x *DatabaseDumpRequest) GetAction() DatabaseDumpRequest_DatabaseDumpRecords { + if x != nil { + return x.Action + } + return DatabaseDumpRequest_DUMP_UNKNOWN +} + +type DatabaseDumpResponse struct { + state protoimpl.MessageState + sizeCache protoimpl.SizeCache + unknownFields protoimpl.UnknownFields + + Model []*db.Model `protobuf:"bytes,1,rep,name=model,proto3" json:"model,omitempty"` + Server []*db.Server `protobuf:"bytes,2,rep,name=server,proto3" json:"server,omitempty"` +} + +func (x *DatabaseDumpResponse) Reset() { + *x = DatabaseDumpResponse{} + if protoimpl.UnsafeEnabled { + mi := &file_mlops_scheduler_db_dump_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) + } +} + +func (x *DatabaseDumpResponse) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*DatabaseDumpResponse) ProtoMessage() {} + +func (x *DatabaseDumpResponse) ProtoReflect() protoreflect.Message { + mi := &file_mlops_scheduler_db_dump_proto_msgTypes[1] + if protoimpl.UnsafeEnabled && x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use DatabaseDumpResponse.ProtoReflect.Descriptor instead. +func (*DatabaseDumpResponse) Descriptor() ([]byte, []int) { + return file_mlops_scheduler_db_dump_proto_rawDescGZIP(), []int{1} +} + +func (x *DatabaseDumpResponse) GetModel() []*db.Model { + if x != nil { + return x.Model + } + return nil +} + +func (x *DatabaseDumpResponse) GetServer() []*db.Server { + if x != nil { + return x.Server + } + return nil +} + +var File_mlops_scheduler_db_dump_proto protoreflect.FileDescriptor + +var file_mlops_scheduler_db_dump_proto_rawDesc = []byte{ + 0x0a, 0x1d, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2f, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, + 0x72, 0x2f, 0x64, 0x62, 0x2d, 0x64, 0x75, 0x6d, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, + 0x1d, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, + 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x64, 0x75, 0x6d, 0x70, 0x1a, 0x1b, + 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2f, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2f, + 0x64, 0x62, 0x2f, 0x64, 0x62, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0xcf, 0x01, 0x0a, 0x13, + 0x44, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x44, 0x75, 0x6d, 0x70, 0x52, 0x65, 0x71, 0x75, + 0x65, 0x73, 0x74, 0x12, 0x5e, 0x0a, 0x06, 0x61, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x18, 0x01, 0x20, + 0x01, 0x28, 0x0e, 0x32, 0x46, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, + 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x64, + 0x75, 0x6d, 0x70, 0x2e, 0x44, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x44, 0x75, 0x6d, 0x70, + 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x2e, 0x44, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, + 0x44, 0x75, 0x6d, 0x70, 0x52, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x52, 0x06, 0x61, 0x63, 0x74, + 0x69, 0x6f, 0x6e, 0x22, 0x58, 0x0a, 0x13, 0x44, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x44, + 0x75, 0x6d, 0x70, 0x52, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x12, 0x10, 0x0a, 0x0c, 0x44, 0x55, + 0x4d, 0x50, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x0c, 0x0a, 0x08, + 0x44, 0x55, 0x4d, 0x50, 0x5f, 0x41, 0x4c, 0x4c, 0x10, 0x01, 0x12, 0x0f, 0x0a, 0x0b, 0x44, 0x55, + 0x4d, 0x50, 0x5f, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x53, 0x10, 0x02, 0x12, 0x10, 0x0a, 0x0c, 0x44, + 0x55, 0x4d, 0x50, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x53, 0x10, 0x03, 0x22, 0x89, 0x01, + 0x0a, 0x14, 0x44, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x44, 0x75, 0x6d, 0x70, 0x52, 0x65, + 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x36, 0x0a, 0x05, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x18, + 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x20, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, + 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, + 0x62, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x52, 0x05, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x39, + 0x0a, 0x06, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x21, + 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, + 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, + 0x72, 0x52, 0x06, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x32, 0x8c, 0x01, 0x0a, 0x0f, 0x44, 0x61, + 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x79, 0x0a, + 0x0c, 0x44, 0x75, 0x6d, 0x70, 0x44, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x12, 0x32, 0x2e, + 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, + 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x64, 0x75, 0x6d, 0x70, 0x2e, 0x44, 0x61, + 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x44, 0x75, 0x6d, 0x70, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, + 0x74, 0x1a, 0x33, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, + 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x64, 0x75, 0x6d, + 0x70, 0x2e, 0x44, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x44, 0x75, 0x6d, 0x70, 0x52, 0x65, + 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x42, 0x43, 0x5a, 0x41, 0x67, 0x69, 0x74, 0x68, + 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x69, 0x6f, 0x2f, + 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2d, 0x63, 0x6f, 0x72, 0x65, 0x2f, 0x61, 0x70, 0x69, 0x73, + 0x2f, 0x67, 0x6f, 0x2f, 0x76, 0x32, 0x2f, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2f, 0x73, 0x63, 0x68, + 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2f, 0x64, 0x62, 0x64, 0x75, 0x6d, 0x70, 0x62, 0x06, 0x70, + 0x72, 0x6f, 0x74, 0x6f, 0x33, +} + +var ( + file_mlops_scheduler_db_dump_proto_rawDescOnce sync.Once + file_mlops_scheduler_db_dump_proto_rawDescData = file_mlops_scheduler_db_dump_proto_rawDesc +) + +func file_mlops_scheduler_db_dump_proto_rawDescGZIP() []byte { + file_mlops_scheduler_db_dump_proto_rawDescOnce.Do(func() { + file_mlops_scheduler_db_dump_proto_rawDescData = protoimpl.X.CompressGZIP(file_mlops_scheduler_db_dump_proto_rawDescData) + }) + return file_mlops_scheduler_db_dump_proto_rawDescData +} + +var file_mlops_scheduler_db_dump_proto_enumTypes = make([]protoimpl.EnumInfo, 1) +var file_mlops_scheduler_db_dump_proto_msgTypes = make([]protoimpl.MessageInfo, 2) +var file_mlops_scheduler_db_dump_proto_goTypes = []any{ + (DatabaseDumpRequest_DatabaseDumpRecords)(0), // 0: seldon.mlops.scheduler.dbdump.DatabaseDumpRequest.DatabaseDumpRecords + (*DatabaseDumpRequest)(nil), // 1: seldon.mlops.scheduler.dbdump.DatabaseDumpRequest + (*DatabaseDumpResponse)(nil), // 2: seldon.mlops.scheduler.dbdump.DatabaseDumpResponse + (*db.Model)(nil), // 3: seldon.mlops.scheduler.db.Model + (*db.Server)(nil), // 4: seldon.mlops.scheduler.db.Server +} +var file_mlops_scheduler_db_dump_proto_depIdxs = []int32{ + 0, // 0: seldon.mlops.scheduler.dbdump.DatabaseDumpRequest.action:type_name -> seldon.mlops.scheduler.dbdump.DatabaseDumpRequest.DatabaseDumpRecords + 3, // 1: seldon.mlops.scheduler.dbdump.DatabaseDumpResponse.model:type_name -> seldon.mlops.scheduler.db.Model + 4, // 2: seldon.mlops.scheduler.dbdump.DatabaseDumpResponse.server:type_name -> seldon.mlops.scheduler.db.Server + 1, // 3: seldon.mlops.scheduler.dbdump.DatabaseService.DumpDatabase:input_type -> seldon.mlops.scheduler.dbdump.DatabaseDumpRequest + 2, // 4: seldon.mlops.scheduler.dbdump.DatabaseService.DumpDatabase:output_type -> seldon.mlops.scheduler.dbdump.DatabaseDumpResponse + 4, // [4:5] is the sub-list for method output_type + 3, // [3:4] is the sub-list for method input_type + 3, // [3:3] is the sub-list for extension type_name + 3, // [3:3] is the sub-list for extension extendee + 0, // [0:3] is the sub-list for field type_name +} + +func init() { file_mlops_scheduler_db_dump_proto_init() } +func file_mlops_scheduler_db_dump_proto_init() { + if File_mlops_scheduler_db_dump_proto != nil { + return + } + if !protoimpl.UnsafeEnabled { + file_mlops_scheduler_db_dump_proto_msgTypes[0].Exporter = func(v any, i int) any { + switch v := v.(*DatabaseDumpRequest); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + file_mlops_scheduler_db_dump_proto_msgTypes[1].Exporter = func(v any, i int) any { + switch v := v.(*DatabaseDumpResponse); i { + case 0: + return &v.state + case 1: + return &v.sizeCache + case 2: + return &v.unknownFields + default: + return nil + } + } + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: file_mlops_scheduler_db_dump_proto_rawDesc, + NumEnums: 1, + NumMessages: 2, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_mlops_scheduler_db_dump_proto_goTypes, + DependencyIndexes: file_mlops_scheduler_db_dump_proto_depIdxs, + EnumInfos: file_mlops_scheduler_db_dump_proto_enumTypes, + MessageInfos: file_mlops_scheduler_db_dump_proto_msgTypes, + }.Build() + File_mlops_scheduler_db_dump_proto = out.File + file_mlops_scheduler_db_dump_proto_rawDesc = nil + file_mlops_scheduler_db_dump_proto_goTypes = nil + file_mlops_scheduler_db_dump_proto_depIdxs = nil +} diff --git a/apis/go/mlops/scheduler/dbdump/db-dump_grpc.pb.go b/apis/go/mlops/scheduler/dbdump/db-dump_grpc.pb.go new file mode 100644 index 0000000000..eddefe420d --- /dev/null +++ b/apis/go/mlops/scheduler/dbdump/db-dump_grpc.pb.go @@ -0,0 +1,119 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. +// versions: +// - protoc-gen-go-grpc v1.4.0 +// - protoc v5.27.2 +// source: mlops/scheduler/db-dump.proto + +package dbdump + +import ( + context "context" + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.62.0 or later. +const _ = grpc.SupportPackageIsVersion8 + +const ( + DatabaseService_DumpDatabase_FullMethodName = "/seldon.mlops.scheduler.dbdump.DatabaseService/DumpDatabase" +) + +// DatabaseServiceClient is the client API for DatabaseService service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. +type DatabaseServiceClient interface { + DumpDatabase(ctx context.Context, in *DatabaseDumpRequest, opts ...grpc.CallOption) (*DatabaseDumpResponse, error) +} + +type databaseServiceClient struct { + cc grpc.ClientConnInterface +} + +func NewDatabaseServiceClient(cc grpc.ClientConnInterface) DatabaseServiceClient { + return &databaseServiceClient{cc} +} + +func (c *databaseServiceClient) DumpDatabase(ctx context.Context, in *DatabaseDumpRequest, opts ...grpc.CallOption) (*DatabaseDumpResponse, error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + out := new(DatabaseDumpResponse) + err := c.cc.Invoke(ctx, DatabaseService_DumpDatabase_FullMethodName, in, out, cOpts...) + if err != nil { + return nil, err + } + return out, nil +} + +// DatabaseServiceServer is the server API for DatabaseService service. +// All implementations must embed UnimplementedDatabaseServiceServer +// for forward compatibility +type DatabaseServiceServer interface { + DumpDatabase(context.Context, *DatabaseDumpRequest) (*DatabaseDumpResponse, error) + mustEmbedUnimplementedDatabaseServiceServer() +} + +// UnimplementedDatabaseServiceServer must be embedded to have forward compatible implementations. +type UnimplementedDatabaseServiceServer struct { +} + +func (UnimplementedDatabaseServiceServer) DumpDatabase(context.Context, *DatabaseDumpRequest) (*DatabaseDumpResponse, error) { + return nil, status.Errorf(codes.Unimplemented, "method DumpDatabase not implemented") +} +func (UnimplementedDatabaseServiceServer) mustEmbedUnimplementedDatabaseServiceServer() {} + +// UnsafeDatabaseServiceServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to DatabaseServiceServer will +// result in compilation errors. +type UnsafeDatabaseServiceServer interface { + mustEmbedUnimplementedDatabaseServiceServer() +} + +func RegisterDatabaseServiceServer(s grpc.ServiceRegistrar, srv DatabaseServiceServer) { + s.RegisterService(&DatabaseService_ServiceDesc, srv) +} + +func _DatabaseService_DumpDatabase_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { + in := new(DatabaseDumpRequest) + if err := dec(in); err != nil { + return nil, err + } + if interceptor == nil { + return srv.(DatabaseServiceServer).DumpDatabase(ctx, in) + } + info := &grpc.UnaryServerInfo{ + Server: srv, + FullMethod: DatabaseService_DumpDatabase_FullMethodName, + } + handler := func(ctx context.Context, req interface{}) (interface{}, error) { + return srv.(DatabaseServiceServer).DumpDatabase(ctx, req.(*DatabaseDumpRequest)) + } + return interceptor(ctx, in, info, handler) +} + +// DatabaseService_ServiceDesc is the grpc.ServiceDesc for DatabaseService service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var DatabaseService_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "seldon.mlops.scheduler.dbdump.DatabaseService", + HandlerType: (*DatabaseServiceServer)(nil), + Methods: []grpc.MethodDesc{ + { + MethodName: "DumpDatabase", + Handler: _DatabaseService_DumpDatabase_Handler, + }, + }, + Streams: []grpc.StreamDesc{}, + Metadata: "mlops/scheduler/db-dump.proto", +} diff --git a/apis/mlops/scheduler/db-dump.proto b/apis/mlops/scheduler/db-dump.proto new file mode 100644 index 0000000000..7b8b5613c8 --- /dev/null +++ b/apis/mlops/scheduler/db-dump.proto @@ -0,0 +1,27 @@ +syntax = "proto3"; + +package seldon.mlops.scheduler.dbdump; + +option go_package = "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/dbdump"; + +import "mlops/scheduler/db/db.proto"; + +message DatabaseDumpRequest { + enum DatabaseDumpRecords { + DUMP_UNKNOWN = 0; + DUMP_ALL = 1; + DUMP_MODELS = 2; + DUMP_SERVERS = 3; + } + DatabaseDumpRecords action = 1; +} + +message DatabaseDumpResponse { + repeated db.Model model = 1; + repeated db.Server server = 2; +} + + +service DatabaseService { + rpc DumpDatabase(DatabaseDumpRequest) returns (DatabaseDumpResponse) {}; +} \ No newline at end of file diff --git a/scheduler/pkg/server/server.go b/scheduler/pkg/server/server.go index dc45680ba6..ca2c9ac250 100644 --- a/scheduler/pkg/server/server.go +++ b/scheduler/pkg/server/server.go @@ -497,6 +497,7 @@ func (s *SchedulerServer) LoadModel(ctx context.Context, req *pb.LoadModelReques logger.Debugf("Load model %+v k8s meta %+v", req.GetModel().GetMeta(), req.GetModel().GetMeta().GetKubernetesMeta()) err := s.modelStore.UpdateModel(req) if err != nil { + logger.WithError(err).WithField("req", req).Error("Failed to update model") return nil, status.Errorf(codes.FailedPrecondition, "%s", err.Error()) } go func() { diff --git a/scheduler/pkg/store/in-memory/store.go b/scheduler/pkg/store/in-memory/store.go index d42c74c7ad..c1d194c16f 100644 --- a/scheduler/pkg/store/in-memory/store.go +++ b/scheduler/pkg/store/in-memory/store.go @@ -11,7 +11,6 @@ package in_memory import ( "context" - "fmt" "sync" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" @@ -56,7 +55,7 @@ func (s *Storage[T]) Insert(_ context.Context, record T) error { defer s.mu.Unlock() if _, ok := s.records[record.GetName()]; ok { - return fmt.Errorf("record with name %s already exists", record.GetName()) + return store.ErrAlreadyExists } s.records[record.GetName()] = proto.Clone(record).(T) diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index a30122bbe6..cb089c16e8 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -129,7 +129,7 @@ func (m *ModelServerStore) UpdateModel(req *pb.LoadModelRequest) error { validName := utils.CheckName(modelName) if !validName { return fmt.Errorf( - "Model %s does not have a valid name - it must be alphanumeric and not contains dots (.)", + "model %s does not have a valid name - it must be alphanumeric and not contains dots (.)", modelName, ) } @@ -148,6 +148,7 @@ func (m *ModelServerStore) UpdateModel(req *pb.LoadModelRequest) error { if model.Deleted { if model.Inactive() { + model := &db.Model{Name: modelName} m.addNextModelVersion(model, req.GetModel()) if err := m.store.models.Update(context.TODO(), model); err != nil { return fmt.Errorf("failed to update model %s: %w", modelName, err) @@ -158,7 +159,6 @@ func (m *ModelServerStore) UpdateModel(req *pb.LoadModelRequest) error { "model %s is in process of deletion - new model can not be created", modelName, ) - } latestModel := model.Latest() diff --git a/scheduler/pkg/store/storage.go b/scheduler/pkg/store/storage.go index 1954da8d5b..9eb8af9d5f 100644 --- a/scheduler/pkg/store/storage.go +++ b/scheduler/pkg/store/storage.go @@ -17,7 +17,8 @@ import ( ) var ( - ErrNotFound = errors.New("record not found") + ErrNotFound = errors.New("record not found") + ErrAlreadyExists = errors.New("record already exists") ) type Storage[T proto.Message] interface { From 14fc9063e283aa64a2d19fa5594e27e052866d05 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Thu, 8 Jan 2026 13:41:38 +0000 Subject: [PATCH 20/50] remove DB dump proto for separate PR --- apis/Makefile | 1 - apis/go/mlops/scheduler/dbdump/db-dump.pb.go | 318 ------------------ .../mlops/scheduler/dbdump/db-dump_grpc.pb.go | 119 ------- apis/mlops/scheduler/db-dump.proto | 27 -- 4 files changed, 465 deletions(-) delete mode 100644 apis/go/mlops/scheduler/dbdump/db-dump.pb.go delete mode 100644 apis/go/mlops/scheduler/dbdump/db-dump_grpc.pb.go delete mode 100644 apis/mlops/scheduler/db-dump.proto diff --git a/apis/Makefile b/apis/Makefile index 6291b6cacb..1a903a95df 100644 --- a/apis/Makefile +++ b/apis/Makefile @@ -163,7 +163,6 @@ build-go: \ ./mlops/proxy/proxy.proto \ ./mlops/scheduler/scheduler.proto \ ./mlops/scheduler/db/db.proto \ - ./mlops/scheduler/db-dump.proto \ ./mlops/chainer/chainer.proto \ ./mlops/v2_dataplane/v2_dataplane.proto \ ./mlops/health/health.proto \ diff --git a/apis/go/mlops/scheduler/dbdump/db-dump.pb.go b/apis/go/mlops/scheduler/dbdump/db-dump.pb.go deleted file mode 100644 index 011040b0fa..0000000000 --- a/apis/go/mlops/scheduler/dbdump/db-dump.pb.go +++ /dev/null @@ -1,318 +0,0 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - -// Code generated by protoc-gen-go. DO NOT EDIT. -// versions: -// protoc-gen-go v1.34.2 -// protoc v5.27.2 -// source: mlops/scheduler/db-dump.proto - -package dbdump - -import ( - db "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" - protoreflect "google.golang.org/protobuf/reflect/protoreflect" - protoimpl "google.golang.org/protobuf/runtime/protoimpl" - reflect "reflect" - sync "sync" -) - -const ( - // Verify that this generated code is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) - // Verify that runtime/protoimpl is sufficiently up-to-date. - _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) -) - -type DatabaseDumpRequest_DatabaseDumpRecords int32 - -const ( - DatabaseDumpRequest_DUMP_UNKNOWN DatabaseDumpRequest_DatabaseDumpRecords = 0 - DatabaseDumpRequest_DUMP_ALL DatabaseDumpRequest_DatabaseDumpRecords = 1 - DatabaseDumpRequest_DUMP_MODELS DatabaseDumpRequest_DatabaseDumpRecords = 2 - DatabaseDumpRequest_DUMP_SERVERS DatabaseDumpRequest_DatabaseDumpRecords = 3 -) - -// Enum value maps for DatabaseDumpRequest_DatabaseDumpRecords. -var ( - DatabaseDumpRequest_DatabaseDumpRecords_name = map[int32]string{ - 0: "DUMP_UNKNOWN", - 1: "DUMP_ALL", - 2: "DUMP_MODELS", - 3: "DUMP_SERVERS", - } - DatabaseDumpRequest_DatabaseDumpRecords_value = map[string]int32{ - "DUMP_UNKNOWN": 0, - "DUMP_ALL": 1, - "DUMP_MODELS": 2, - "DUMP_SERVERS": 3, - } -) - -func (x DatabaseDumpRequest_DatabaseDumpRecords) Enum() *DatabaseDumpRequest_DatabaseDumpRecords { - p := new(DatabaseDumpRequest_DatabaseDumpRecords) - *p = x - return p -} - -func (x DatabaseDumpRequest_DatabaseDumpRecords) String() string { - return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) -} - -func (DatabaseDumpRequest_DatabaseDumpRecords) Descriptor() protoreflect.EnumDescriptor { - return file_mlops_scheduler_db_dump_proto_enumTypes[0].Descriptor() -} - -func (DatabaseDumpRequest_DatabaseDumpRecords) Type() protoreflect.EnumType { - return &file_mlops_scheduler_db_dump_proto_enumTypes[0] -} - -func (x DatabaseDumpRequest_DatabaseDumpRecords) Number() protoreflect.EnumNumber { - return protoreflect.EnumNumber(x) -} - -// Deprecated: Use DatabaseDumpRequest_DatabaseDumpRecords.Descriptor instead. -func (DatabaseDumpRequest_DatabaseDumpRecords) EnumDescriptor() ([]byte, []int) { - return file_mlops_scheduler_db_dump_proto_rawDescGZIP(), []int{0, 0} -} - -type DatabaseDumpRequest struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Action DatabaseDumpRequest_DatabaseDumpRecords `protobuf:"varint,1,opt,name=action,proto3,enum=seldon.mlops.scheduler.dbdump.DatabaseDumpRequest_DatabaseDumpRecords" json:"action,omitempty"` -} - -func (x *DatabaseDumpRequest) Reset() { - *x = DatabaseDumpRequest{} - if protoimpl.UnsafeEnabled { - mi := &file_mlops_scheduler_db_dump_proto_msgTypes[0] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *DatabaseDumpRequest) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*DatabaseDumpRequest) ProtoMessage() {} - -func (x *DatabaseDumpRequest) ProtoReflect() protoreflect.Message { - mi := &file_mlops_scheduler_db_dump_proto_msgTypes[0] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use DatabaseDumpRequest.ProtoReflect.Descriptor instead. -func (*DatabaseDumpRequest) Descriptor() ([]byte, []int) { - return file_mlops_scheduler_db_dump_proto_rawDescGZIP(), []int{0} -} - -func (x *DatabaseDumpRequest) GetAction() DatabaseDumpRequest_DatabaseDumpRecords { - if x != nil { - return x.Action - } - return DatabaseDumpRequest_DUMP_UNKNOWN -} - -type DatabaseDumpResponse struct { - state protoimpl.MessageState - sizeCache protoimpl.SizeCache - unknownFields protoimpl.UnknownFields - - Model []*db.Model `protobuf:"bytes,1,rep,name=model,proto3" json:"model,omitempty"` - Server []*db.Server `protobuf:"bytes,2,rep,name=server,proto3" json:"server,omitempty"` -} - -func (x *DatabaseDumpResponse) Reset() { - *x = DatabaseDumpResponse{} - if protoimpl.UnsafeEnabled { - mi := &file_mlops_scheduler_db_dump_proto_msgTypes[1] - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - ms.StoreMessageInfo(mi) - } -} - -func (x *DatabaseDumpResponse) String() string { - return protoimpl.X.MessageStringOf(x) -} - -func (*DatabaseDumpResponse) ProtoMessage() {} - -func (x *DatabaseDumpResponse) ProtoReflect() protoreflect.Message { - mi := &file_mlops_scheduler_db_dump_proto_msgTypes[1] - if protoimpl.UnsafeEnabled && x != nil { - ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) - if ms.LoadMessageInfo() == nil { - ms.StoreMessageInfo(mi) - } - return ms - } - return mi.MessageOf(x) -} - -// Deprecated: Use DatabaseDumpResponse.ProtoReflect.Descriptor instead. -func (*DatabaseDumpResponse) Descriptor() ([]byte, []int) { - return file_mlops_scheduler_db_dump_proto_rawDescGZIP(), []int{1} -} - -func (x *DatabaseDumpResponse) GetModel() []*db.Model { - if x != nil { - return x.Model - } - return nil -} - -func (x *DatabaseDumpResponse) GetServer() []*db.Server { - if x != nil { - return x.Server - } - return nil -} - -var File_mlops_scheduler_db_dump_proto protoreflect.FileDescriptor - -var file_mlops_scheduler_db_dump_proto_rawDesc = []byte{ - 0x0a, 0x1d, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2f, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, - 0x72, 0x2f, 0x64, 0x62, 0x2d, 0x64, 0x75, 0x6d, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, - 0x1d, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, - 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x64, 0x75, 0x6d, 0x70, 0x1a, 0x1b, - 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2f, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2f, - 0x64, 0x62, 0x2f, 0x64, 0x62, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x22, 0xcf, 0x01, 0x0a, 0x13, - 0x44, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x44, 0x75, 0x6d, 0x70, 0x52, 0x65, 0x71, 0x75, - 0x65, 0x73, 0x74, 0x12, 0x5e, 0x0a, 0x06, 0x61, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x18, 0x01, 0x20, - 0x01, 0x28, 0x0e, 0x32, 0x46, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, - 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x64, - 0x75, 0x6d, 0x70, 0x2e, 0x44, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x44, 0x75, 0x6d, 0x70, - 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x2e, 0x44, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, - 0x44, 0x75, 0x6d, 0x70, 0x52, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x52, 0x06, 0x61, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x22, 0x58, 0x0a, 0x13, 0x44, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x44, - 0x75, 0x6d, 0x70, 0x52, 0x65, 0x63, 0x6f, 0x72, 0x64, 0x73, 0x12, 0x10, 0x0a, 0x0c, 0x44, 0x55, - 0x4d, 0x50, 0x5f, 0x55, 0x4e, 0x4b, 0x4e, 0x4f, 0x57, 0x4e, 0x10, 0x00, 0x12, 0x0c, 0x0a, 0x08, - 0x44, 0x55, 0x4d, 0x50, 0x5f, 0x41, 0x4c, 0x4c, 0x10, 0x01, 0x12, 0x0f, 0x0a, 0x0b, 0x44, 0x55, - 0x4d, 0x50, 0x5f, 0x4d, 0x4f, 0x44, 0x45, 0x4c, 0x53, 0x10, 0x02, 0x12, 0x10, 0x0a, 0x0c, 0x44, - 0x55, 0x4d, 0x50, 0x5f, 0x53, 0x45, 0x52, 0x56, 0x45, 0x52, 0x53, 0x10, 0x03, 0x22, 0x89, 0x01, - 0x0a, 0x14, 0x44, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x44, 0x75, 0x6d, 0x70, 0x52, 0x65, - 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x12, 0x36, 0x0a, 0x05, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x18, - 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x20, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, - 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, - 0x62, 0x2e, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x52, 0x05, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x12, 0x39, - 0x0a, 0x06, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x21, - 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, - 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x2e, 0x53, 0x65, 0x72, 0x76, 0x65, - 0x72, 0x52, 0x06, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x32, 0x8c, 0x01, 0x0a, 0x0f, 0x44, 0x61, - 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x53, 0x65, 0x72, 0x76, 0x69, 0x63, 0x65, 0x12, 0x79, 0x0a, - 0x0c, 0x44, 0x75, 0x6d, 0x70, 0x44, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x12, 0x32, 0x2e, - 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x63, 0x68, - 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x64, 0x75, 0x6d, 0x70, 0x2e, 0x44, 0x61, - 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x44, 0x75, 0x6d, 0x70, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, - 0x74, 0x1a, 0x33, 0x2e, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2e, 0x6d, 0x6c, 0x6f, 0x70, 0x73, - 0x2e, 0x73, 0x63, 0x68, 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2e, 0x64, 0x62, 0x64, 0x75, 0x6d, - 0x70, 0x2e, 0x44, 0x61, 0x74, 0x61, 0x62, 0x61, 0x73, 0x65, 0x44, 0x75, 0x6d, 0x70, 0x52, 0x65, - 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x22, 0x00, 0x42, 0x43, 0x5a, 0x41, 0x67, 0x69, 0x74, 0x68, - 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, 0x2f, 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x69, 0x6f, 0x2f, - 0x73, 0x65, 0x6c, 0x64, 0x6f, 0x6e, 0x2d, 0x63, 0x6f, 0x72, 0x65, 0x2f, 0x61, 0x70, 0x69, 0x73, - 0x2f, 0x67, 0x6f, 0x2f, 0x76, 0x32, 0x2f, 0x6d, 0x6c, 0x6f, 0x70, 0x73, 0x2f, 0x73, 0x63, 0x68, - 0x65, 0x64, 0x75, 0x6c, 0x65, 0x72, 0x2f, 0x64, 0x62, 0x64, 0x75, 0x6d, 0x70, 0x62, 0x06, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x33, -} - -var ( - file_mlops_scheduler_db_dump_proto_rawDescOnce sync.Once - file_mlops_scheduler_db_dump_proto_rawDescData = file_mlops_scheduler_db_dump_proto_rawDesc -) - -func file_mlops_scheduler_db_dump_proto_rawDescGZIP() []byte { - file_mlops_scheduler_db_dump_proto_rawDescOnce.Do(func() { - file_mlops_scheduler_db_dump_proto_rawDescData = protoimpl.X.CompressGZIP(file_mlops_scheduler_db_dump_proto_rawDescData) - }) - return file_mlops_scheduler_db_dump_proto_rawDescData -} - -var file_mlops_scheduler_db_dump_proto_enumTypes = make([]protoimpl.EnumInfo, 1) -var file_mlops_scheduler_db_dump_proto_msgTypes = make([]protoimpl.MessageInfo, 2) -var file_mlops_scheduler_db_dump_proto_goTypes = []any{ - (DatabaseDumpRequest_DatabaseDumpRecords)(0), // 0: seldon.mlops.scheduler.dbdump.DatabaseDumpRequest.DatabaseDumpRecords - (*DatabaseDumpRequest)(nil), // 1: seldon.mlops.scheduler.dbdump.DatabaseDumpRequest - (*DatabaseDumpResponse)(nil), // 2: seldon.mlops.scheduler.dbdump.DatabaseDumpResponse - (*db.Model)(nil), // 3: seldon.mlops.scheduler.db.Model - (*db.Server)(nil), // 4: seldon.mlops.scheduler.db.Server -} -var file_mlops_scheduler_db_dump_proto_depIdxs = []int32{ - 0, // 0: seldon.mlops.scheduler.dbdump.DatabaseDumpRequest.action:type_name -> seldon.mlops.scheduler.dbdump.DatabaseDumpRequest.DatabaseDumpRecords - 3, // 1: seldon.mlops.scheduler.dbdump.DatabaseDumpResponse.model:type_name -> seldon.mlops.scheduler.db.Model - 4, // 2: seldon.mlops.scheduler.dbdump.DatabaseDumpResponse.server:type_name -> seldon.mlops.scheduler.db.Server - 1, // 3: seldon.mlops.scheduler.dbdump.DatabaseService.DumpDatabase:input_type -> seldon.mlops.scheduler.dbdump.DatabaseDumpRequest - 2, // 4: seldon.mlops.scheduler.dbdump.DatabaseService.DumpDatabase:output_type -> seldon.mlops.scheduler.dbdump.DatabaseDumpResponse - 4, // [4:5] is the sub-list for method output_type - 3, // [3:4] is the sub-list for method input_type - 3, // [3:3] is the sub-list for extension type_name - 3, // [3:3] is the sub-list for extension extendee - 0, // [0:3] is the sub-list for field type_name -} - -func init() { file_mlops_scheduler_db_dump_proto_init() } -func file_mlops_scheduler_db_dump_proto_init() { - if File_mlops_scheduler_db_dump_proto != nil { - return - } - if !protoimpl.UnsafeEnabled { - file_mlops_scheduler_db_dump_proto_msgTypes[0].Exporter = func(v any, i int) any { - switch v := v.(*DatabaseDumpRequest); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - file_mlops_scheduler_db_dump_proto_msgTypes[1].Exporter = func(v any, i int) any { - switch v := v.(*DatabaseDumpResponse); i { - case 0: - return &v.state - case 1: - return &v.sizeCache - case 2: - return &v.unknownFields - default: - return nil - } - } - } - type x struct{} - out := protoimpl.TypeBuilder{ - File: protoimpl.DescBuilder{ - GoPackagePath: reflect.TypeOf(x{}).PkgPath(), - RawDescriptor: file_mlops_scheduler_db_dump_proto_rawDesc, - NumEnums: 1, - NumMessages: 2, - NumExtensions: 0, - NumServices: 1, - }, - GoTypes: file_mlops_scheduler_db_dump_proto_goTypes, - DependencyIndexes: file_mlops_scheduler_db_dump_proto_depIdxs, - EnumInfos: file_mlops_scheduler_db_dump_proto_enumTypes, - MessageInfos: file_mlops_scheduler_db_dump_proto_msgTypes, - }.Build() - File_mlops_scheduler_db_dump_proto = out.File - file_mlops_scheduler_db_dump_proto_rawDesc = nil - file_mlops_scheduler_db_dump_proto_goTypes = nil - file_mlops_scheduler_db_dump_proto_depIdxs = nil -} diff --git a/apis/go/mlops/scheduler/dbdump/db-dump_grpc.pb.go b/apis/go/mlops/scheduler/dbdump/db-dump_grpc.pb.go deleted file mode 100644 index eddefe420d..0000000000 --- a/apis/go/mlops/scheduler/dbdump/db-dump_grpc.pb.go +++ /dev/null @@ -1,119 +0,0 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - -// Code generated by protoc-gen-go-grpc. DO NOT EDIT. -// versions: -// - protoc-gen-go-grpc v1.4.0 -// - protoc v5.27.2 -// source: mlops/scheduler/db-dump.proto - -package dbdump - -import ( - context "context" - grpc "google.golang.org/grpc" - codes "google.golang.org/grpc/codes" - status "google.golang.org/grpc/status" -) - -// This is a compile-time assertion to ensure that this generated file -// is compatible with the grpc package it is being compiled against. -// Requires gRPC-Go v1.62.0 or later. -const _ = grpc.SupportPackageIsVersion8 - -const ( - DatabaseService_DumpDatabase_FullMethodName = "/seldon.mlops.scheduler.dbdump.DatabaseService/DumpDatabase" -) - -// DatabaseServiceClient is the client API for DatabaseService service. -// -// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. -type DatabaseServiceClient interface { - DumpDatabase(ctx context.Context, in *DatabaseDumpRequest, opts ...grpc.CallOption) (*DatabaseDumpResponse, error) -} - -type databaseServiceClient struct { - cc grpc.ClientConnInterface -} - -func NewDatabaseServiceClient(cc grpc.ClientConnInterface) DatabaseServiceClient { - return &databaseServiceClient{cc} -} - -func (c *databaseServiceClient) DumpDatabase(ctx context.Context, in *DatabaseDumpRequest, opts ...grpc.CallOption) (*DatabaseDumpResponse, error) { - cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) - out := new(DatabaseDumpResponse) - err := c.cc.Invoke(ctx, DatabaseService_DumpDatabase_FullMethodName, in, out, cOpts...) - if err != nil { - return nil, err - } - return out, nil -} - -// DatabaseServiceServer is the server API for DatabaseService service. -// All implementations must embed UnimplementedDatabaseServiceServer -// for forward compatibility -type DatabaseServiceServer interface { - DumpDatabase(context.Context, *DatabaseDumpRequest) (*DatabaseDumpResponse, error) - mustEmbedUnimplementedDatabaseServiceServer() -} - -// UnimplementedDatabaseServiceServer must be embedded to have forward compatible implementations. -type UnimplementedDatabaseServiceServer struct { -} - -func (UnimplementedDatabaseServiceServer) DumpDatabase(context.Context, *DatabaseDumpRequest) (*DatabaseDumpResponse, error) { - return nil, status.Errorf(codes.Unimplemented, "method DumpDatabase not implemented") -} -func (UnimplementedDatabaseServiceServer) mustEmbedUnimplementedDatabaseServiceServer() {} - -// UnsafeDatabaseServiceServer may be embedded to opt out of forward compatibility for this service. -// Use of this interface is not recommended, as added methods to DatabaseServiceServer will -// result in compilation errors. -type UnsafeDatabaseServiceServer interface { - mustEmbedUnimplementedDatabaseServiceServer() -} - -func RegisterDatabaseServiceServer(s grpc.ServiceRegistrar, srv DatabaseServiceServer) { - s.RegisterService(&DatabaseService_ServiceDesc, srv) -} - -func _DatabaseService_DumpDatabase_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { - in := new(DatabaseDumpRequest) - if err := dec(in); err != nil { - return nil, err - } - if interceptor == nil { - return srv.(DatabaseServiceServer).DumpDatabase(ctx, in) - } - info := &grpc.UnaryServerInfo{ - Server: srv, - FullMethod: DatabaseService_DumpDatabase_FullMethodName, - } - handler := func(ctx context.Context, req interface{}) (interface{}, error) { - return srv.(DatabaseServiceServer).DumpDatabase(ctx, req.(*DatabaseDumpRequest)) - } - return interceptor(ctx, in, info, handler) -} - -// DatabaseService_ServiceDesc is the grpc.ServiceDesc for DatabaseService service. -// It's only intended for direct use with grpc.RegisterService, -// and not to be introspected or modified (even as a copy) -var DatabaseService_ServiceDesc = grpc.ServiceDesc{ - ServiceName: "seldon.mlops.scheduler.dbdump.DatabaseService", - HandlerType: (*DatabaseServiceServer)(nil), - Methods: []grpc.MethodDesc{ - { - MethodName: "DumpDatabase", - Handler: _DatabaseService_DumpDatabase_Handler, - }, - }, - Streams: []grpc.StreamDesc{}, - Metadata: "mlops/scheduler/db-dump.proto", -} diff --git a/apis/mlops/scheduler/db-dump.proto b/apis/mlops/scheduler/db-dump.proto deleted file mode 100644 index 7b8b5613c8..0000000000 --- a/apis/mlops/scheduler/db-dump.proto +++ /dev/null @@ -1,27 +0,0 @@ -syntax = "proto3"; - -package seldon.mlops.scheduler.dbdump; - -option go_package = "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/dbdump"; - -import "mlops/scheduler/db/db.proto"; - -message DatabaseDumpRequest { - enum DatabaseDumpRecords { - DUMP_UNKNOWN = 0; - DUMP_ALL = 1; - DUMP_MODELS = 2; - DUMP_SERVERS = 3; - } - DatabaseDumpRecords action = 1; -} - -message DatabaseDumpResponse { - repeated db.Model model = 1; - repeated db.Server server = 2; -} - - -service DatabaseService { - rpc DumpDatabase(DatabaseDumpRequest) returns (DatabaseDumpResponse) {}; -} \ No newline at end of file From 8242d4c6fdd9b947b9bdf7d3125328c30cb830dc Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Thu, 8 Jan 2026 14:38:56 +0000 Subject: [PATCH 21/50] fix circular import, wip fixing tests --- scheduler/cmd/scheduler/main.go | 3 +- .../store.go => in_memory_store.go} | 33 +- scheduler/pkg/store/memory_status_test.go | 293 ++++++++++-------- 3 files changed, 175 insertions(+), 154 deletions(-) rename scheduler/pkg/store/{in-memory/store.go => in_memory_store.go} (65%) diff --git a/scheduler/cmd/scheduler/main.go b/scheduler/cmd/scheduler/main.go index 3ebdfb3ea9..9503482081 100644 --- a/scheduler/cmd/scheduler/main.go +++ b/scheduler/cmd/scheduler/main.go @@ -24,7 +24,6 @@ import ( "time" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" - in_memory "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/in-memory" log "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/backoff" @@ -300,7 +299,7 @@ func main() { } // Create stores - ss := store.NewMemoryStore(logger, in_memory.NewStorage[*db.Model](), in_memory.NewStorage[*db.Server](), eventHub) + ss := store.NewMemoryStore(logger, store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), eventHub) ps := pipeline.NewPipelineStore(logger, eventHub, ss) es := experiment.NewExperimentServer(logger, eventHub, ss, ps) cleaner := cleaner.NewVersionCleaner(ss, logger) diff --git a/scheduler/pkg/store/in-memory/store.go b/scheduler/pkg/store/in_memory_store.go similarity index 65% rename from scheduler/pkg/store/in-memory/store.go rename to scheduler/pkg/store/in_memory_store.go index c1d194c16f..f1be6e5c6e 100644 --- a/scheduler/pkg/store/in-memory/store.go +++ b/scheduler/pkg/store/in_memory_store.go @@ -7,18 +7,17 @@ Use of this software is governed BY the Change License after the Change Date as each is defined in accordance with the LICENSE file. */ -package in_memory +package store import ( "context" "sync" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "google.golang.org/protobuf/proto" ) -type Storage[T interface { +type StorageInMemory[T interface { proto.Message GetName() string }] struct { @@ -26,43 +25,43 @@ type Storage[T interface { records map[string]T } -var _ store.Storage[*db.Model] = &Storage[*db.Model]{} -var _ store.Storage[*db.Server] = &Storage[*db.Server]{} +var _ Storage[*db.Model] = &StorageInMemory[*db.Model]{} +var _ Storage[*db.Server] = &StorageInMemory[*db.Server]{} -func NewStorage[T interface { +func NewInMemoryStorage[T interface { proto.Message GetName() string -}]() *Storage[T] { - return &Storage[T]{ +}]() *StorageInMemory[T] { + return &StorageInMemory[T]{ records: make(map[string]T), } } -func (s *Storage[T]) Get(_ context.Context, id string) (T, error) { +func (s *StorageInMemory[T]) Get(_ context.Context, id string) (T, error) { s.mu.RLock() defer s.mu.RUnlock() record, ok := s.records[id] if !ok { - return *new(T), store.ErrNotFound + return *new(T), ErrNotFound } return proto.Clone(record).(T), nil } -func (s *Storage[T]) Insert(_ context.Context, record T) error { +func (s *StorageInMemory[T]) Insert(_ context.Context, record T) error { s.mu.Lock() defer s.mu.Unlock() if _, ok := s.records[record.GetName()]; ok { - return store.ErrAlreadyExists + return ErrAlreadyExists } s.records[record.GetName()] = proto.Clone(record).(T) return nil } -func (s *Storage[T]) List(_ context.Context) ([]T, error) { +func (s *StorageInMemory[T]) List(_ context.Context) ([]T, error) { s.mu.RLock() defer s.mu.RUnlock() @@ -74,24 +73,24 @@ func (s *Storage[T]) List(_ context.Context) ([]T, error) { return records, nil } -func (s *Storage[T]) Update(_ context.Context, record T) error { +func (s *StorageInMemory[T]) Update(_ context.Context, record T) error { s.mu.Lock() defer s.mu.Unlock() if _, ok := s.records[record.GetName()]; !ok { - return store.ErrNotFound + return ErrNotFound } s.records[record.GetName()] = proto.Clone(record).(T) return nil } -func (s *Storage[T]) Delete(_ context.Context, id string) error { +func (s *StorageInMemory[T]) Delete(_ context.Context, id string) error { s.mu.Lock() defer s.mu.Unlock() if _, ok := s.records[id]; !ok { - return store.ErrNotFound + return ErrNotFound } delete(s.records, id) diff --git a/scheduler/pkg/store/memory_status_test.go b/scheduler/pkg/store/memory_status_test.go index 07d3d194ab..d73cb23b1a 100644 --- a/scheduler/pkg/store/memory_status_test.go +++ b/scheduler/pkg/store/memory_status_test.go @@ -10,83 +10,86 @@ the Change License after the Change Date as each is defined in accordance with t package store import ( + "context" "testing" . "github.com/onsi/gomega" log "github.com/sirupsen/logrus" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" ) func TestUpdateStatus(t *testing.T) { g := NewGomegaWithT(t) + ctx := context.Background() type test struct { name string - store *LocalSchedulerStore + models []*db.Model + servers []*db.Server modelName string serverName string version uint32 prevVersion *uint32 - expectedModelStatus ModelState + expectedModelStatus db.ModelState } prevVersion := uint32(1) tests := []test{ { name: "Available", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model": { - versions: []*ModelVersion{ - { - version: 1, - modelDefn: &pb.Model{ - Meta: &pb.MetaData{ - Name: "model", - }, - ModelSpec: &pb.ModelSpec{}, - DeploymentSpec: &pb.DeploymentSpec{ - Replicas: 1, - }, + models: []*db.Model{ + { + Name: "model", + Versions: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", }, - server: "server2", - replicas: map[int]ReplicaStatus{ - 0: {State: Loaded}, + ModelSpec: &pb.ModelSpec{}, + DeploymentSpec: &pb.DeploymentSpec{ + Replicas: 1, }, }, - { - version: 2, - modelDefn: &pb.Model{ - Meta: &pb.MetaData{ - Name: "model", - }, - ModelSpec: &pb.ModelSpec{}, - DeploymentSpec: &pb.DeploymentSpec{ - Replicas: 1, - }, + Server: "server2", + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Loaded}, + }, + }, + { + Version: 2, + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", }, - server: "server1", - replicas: map[int]ReplicaStatus{ - 0: {State: Available}, + ModelSpec: &pb.ModelSpec{}, + DeploymentSpec: &pb.DeploymentSpec{ + Replicas: 1, }, }, + Server: "server1", + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Available}, + }, }, }, }, - servers: map[string]*Server{ - "server1": { - name: "server1", - replicas: map[int]*ServerReplica{ - 0: {}, - }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, }, - "server2": { - name: "server2", - replicas: map[int]*ServerReplica{ - 0: {}, - }, + }, + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, }, }, }, @@ -94,62 +97,61 @@ func TestUpdateStatus(t *testing.T) { serverName: "server2", version: 2, prevVersion: nil, - expectedModelStatus: ModelAvailable, + expectedModelStatus: db.ModelState_ModelAvailable, }, { name: "Available - Min replicas", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model": { - versions: []*ModelVersion{ - { - version: 1, - modelDefn: &pb.Model{ - Meta: &pb.MetaData{ - Name: "model", - }, - ModelSpec: &pb.ModelSpec{}, - DeploymentSpec: &pb.DeploymentSpec{ - Replicas: 1, - }, + models: []*db.Model{ + { + Name: "model", + Versions: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", }, - server: "server2", - replicas: map[int]ReplicaStatus{ - 0: {State: Loaded}, + ModelSpec: &pb.ModelSpec{}, + DeploymentSpec: &pb.DeploymentSpec{ + Replicas: 1, }, }, - { - version: 2, - modelDefn: &pb.Model{ - Meta: &pb.MetaData{ - Name: "model", - }, - ModelSpec: &pb.ModelSpec{}, - DeploymentSpec: &pb.DeploymentSpec{ - Replicas: 2, - MinReplicas: 1, - }, + Server: "server2", + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Loaded}, + }, + }, + { + Version: 2, + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", }, - server: "server1", - replicas: map[int]ReplicaStatus{ - 0: {State: Available}, + ModelSpec: &pb.ModelSpec{}, + DeploymentSpec: &pb.DeploymentSpec{ + Replicas: 2, + MinReplicas: 1, }, }, + Server: "server1", + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Available}, + }, }, }, }, - servers: map[string]*Server{ - "server1": { - name: "server1", - replicas: map[int]*ServerReplica{ - 0: {}, - }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, }, - "server2": { - name: "server2", - replicas: map[int]*ServerReplica{ - 0: {}, - }, + }, + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, }, }, }, @@ -157,62 +159,61 @@ func TestUpdateStatus(t *testing.T) { serverName: "server2", version: 2, prevVersion: nil, - expectedModelStatus: ModelAvailable, + expectedModelStatus: db.ModelState_ModelAvailable, }, { name: "NotEnoughReplicasButPreviousAvailable", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model": { - versions: []*ModelVersion{ - { - version: 1, - modelDefn: &pb.Model{ - Meta: &pb.MetaData{ - Name: "model", - }, - ModelSpec: &pb.ModelSpec{}, - DeploymentSpec: &pb.DeploymentSpec{ - Replicas: 1, - }, + models: []*db.Model{ + { + Name: "model", + Versions: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", }, - server: "server2", - replicas: map[int]ReplicaStatus{ - 0: {State: Available}, + ModelSpec: &pb.ModelSpec{}, + DeploymentSpec: &pb.DeploymentSpec{ + Replicas: 1, }, - state: ModelStatus{State: ModelAvailable}, }, - { - version: 2, - modelDefn: &pb.Model{ - Meta: &pb.MetaData{ - Name: "model", - }, - ModelSpec: &pb.ModelSpec{}, - DeploymentSpec: &pb.DeploymentSpec{ - Replicas: 2, - }, + Server: "server2", + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Available}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + { + Version: 2, + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", }, - server: "server1", - replicas: map[int]ReplicaStatus{ - 0: {State: Available}, + ModelSpec: &pb.ModelSpec{}, + DeploymentSpec: &pb.DeploymentSpec{ + Replicas: 2, }, }, + Server: "server1", + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Available}, + }, }, }, }, - servers: map[string]*Server{ - "server1": { - name: "server1", - replicas: map[int]*ServerReplica{ - 0: {}, - }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, }, - "server2": { - name: "server2", - replicas: map[int]*ServerReplica{ - 0: {}, - }, + }, + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, }, }, }, @@ -220,7 +221,7 @@ func TestUpdateStatus(t *testing.T) { serverName: "server2", version: 2, prevVersion: &prevVersion, - expectedModelStatus: ModelAvailable, + expectedModelStatus: db.ModelState_ModelAvailable, }, } for _, test := range tests { @@ -228,17 +229,39 @@ func TestUpdateStatus(t *testing.T) { logger := log.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - ms := NewMemoryStore(logger, test.store, eventHub) + + // Create storage instances + modelStorage := NewInMemoryStorage[*db.Model]() + serverStorage := NewInMemoryStorage[*db.Server]() + + // Populate storage with test data + for _, model := range test.models { + err := modelStorage.Insert(ctx, model) + g.Expect(err).To(BeNil()) + } + for _, server := range test.servers { + err := serverStorage.Insert(ctx, server) + g.Expect(err).To(BeNil()) + } + + // Create MemoryStore with populated storage + ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) + + // Get model and version for testing model, modelVersion, _, err := ms.getModelServer(test.modelName, test.version, test.serverName) - var prevModelVersion *ModelVersion + g.Expect(err).To(BeNil()) + + var prevModelVersion *db.ModelVersion if test.prevVersion != nil { _, prevModelVersion, _, err = ms.getModelServer(test.modelName, *test.prevVersion, test.serverName) g.Expect(err).To(BeNil()) } - g.Expect(err).To(BeNil()) + + // Update model status isLatest := model.Latest().GetVersion() == modelVersion.GetVersion() - ms.updateModelStatus(isLatest, model.IsDeleted(), modelVersion, prevModelVersion) - g.Expect(modelVersion.state.State).To(Equal(test.expectedModelStatus)) + err = ms.updateModelStatus(isLatest, model.Deleted, modelVersion, prevModelVersion, model) + g.Expect(err).To(BeNil()) + g.Expect(modelVersion.State.State).To(Equal(test.expectedModelStatus)) }) } } From d9e75beae4672f3bf71830443f335b2e6f8cd800 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Thu, 8 Jan 2026 17:29:46 +0000 Subject: [PATCH 22/50] wip: fixing tests --- scheduler/pkg/store/memory_test.go | 902 +++++++++++++++++------------ 1 file changed, 532 insertions(+), 370 deletions(-) diff --git a/scheduler/pkg/store/memory_test.go b/scheduler/pkg/store/memory_test.go index 31d259f84a..3e0b2ca8aa 100644 --- a/scheduler/pkg/store/memory_test.go +++ b/scheduler/pkg/store/memory_test.go @@ -10,27 +10,27 @@ the Change License after the Change Date as each is defined in accordance with t package store import ( + "context" "errors" - "sync" - "sync/atomic" "testing" - "time" . "github.com/onsi/gomega" log "github.com/sirupsen/logrus" + "google.golang.org/protobuf/proto" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" ) func TestUpdateModel(t *testing.T) { g := NewGomegaWithT(t) + ctx := context.Background() type test struct { name string - store *LocalSchedulerStore + models []*db.Model loadModelReq *pb.LoadModelRequest expectedVersion uint32 err error @@ -38,8 +38,8 @@ func TestUpdateModel(t *testing.T) { tests := []test{ { - name: "simple", - store: NewLocalSchedulerStore(), + name: "simple", + models: []*db.Model{}, loadModelReq: &pb.LoadModelRequest{ Model: &pb.Model{ Meta: &pb.MetaData{ @@ -50,8 +50,8 @@ func TestUpdateModel(t *testing.T) { expectedVersion: 1, }, { - name: "simple with generation", - store: NewLocalSchedulerStore(), + name: "simple with generation", + models: []*db.Model{}, loadModelReq: &pb.LoadModelRequest{ Model: &pb.Model{ Meta: &pb.MetaData{ @@ -66,18 +66,18 @@ func TestUpdateModel(t *testing.T) { }, { name: "VersionAlreadyExists", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model": { - versions: []*ModelVersion{ - { - version: 1, - modelDefn: &pb.Model{ - Meta: &pb.MetaData{ - Name: "model", - }, + models: []*db.Model{ + { + Name: "model", + Versions: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", }, }, + Replicas: make(map[int32]*db.ReplicaStatus), }, }, }, @@ -93,18 +93,18 @@ func TestUpdateModel(t *testing.T) { }, { name: "Meta data is changed - no new version created assuming same name of model", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model": { - versions: []*ModelVersion{ - { - version: 1, - modelDefn: &pb.Model{ - Meta: &pb.MetaData{ - Name: "model", - }, + models: []*db.Model{ + { + Name: "model", + Versions: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", }, }, + Replicas: make(map[int32]*db.ReplicaStatus), }, }, }, @@ -123,24 +123,24 @@ func TestUpdateModel(t *testing.T) { }, { name: "DeploymentSpecDiffers", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model": { - versions: []*ModelVersion{ - { - version: 1, - modelDefn: &pb.Model{ - Meta: &pb.MetaData{ - Name: "model", - }, - ModelSpec: &pb.ModelSpec{ - Uri: "gs:/models/iris", - }, - DeploymentSpec: &pb.DeploymentSpec{ - Replicas: 2, - }, + models: []*db.Model{ + { + Name: "model", + Versions: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", + }, + ModelSpec: &pb.ModelSpec{ + Uri: "gs:/models/iris", + }, + DeploymentSpec: &pb.DeploymentSpec{ + Replicas: 2, }, }, + Replicas: make(map[int32]*db.ReplicaStatus), }, }, }, @@ -162,24 +162,24 @@ func TestUpdateModel(t *testing.T) { }, { name: "ModelSpecDiffers", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model": { - versions: []*ModelVersion{ - { - version: 1, - modelDefn: &pb.Model{ - Meta: &pb.MetaData{ - Name: "model", - }, - ModelSpec: &pb.ModelSpec{ - Uri: "gs:/models/iris", - }, - DeploymentSpec: &pb.DeploymentSpec{ - Replicas: 2, - }, + models: []*db.Model{ + { + Name: "model", + Versions: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", + }, + ModelSpec: &pb.ModelSpec{ + Uri: "gs:/models/iris", + }, + DeploymentSpec: &pb.DeploymentSpec{ + Replicas: 2, }, }, + Replicas: make(map[int32]*db.ReplicaStatus), }, }, }, @@ -200,8 +200,8 @@ func TestUpdateModel(t *testing.T) { expectedVersion: 2, }, { - name: "ModelNameIsNotValid", - store: NewLocalSchedulerStore(), + name: "ModelNameIsNotValid", + models: []*db.Model{}, loadModelReq: &pb.LoadModelRequest{ Model: &pb.Model{ Meta: &pb.MetaData{ @@ -210,7 +210,7 @@ func TestUpdateModel(t *testing.T) { }, }, expectedVersion: 1, - err: errors.New("Model this.Name does not have a valid name - it must be alphanumeric and not contains dots (.)"), + err: errors.New("model this.Name does not have a valid name - it must be alphanumeric and not contains dots (.)"), }, } @@ -219,27 +219,46 @@ func TestUpdateModel(t *testing.T) { logger := log.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - ms := NewMemoryStore(logger, test.store, eventHub) + + // Create storage instances + modelStorage := NewInMemoryStorage[*db.Model]() + serverStorage := NewInMemoryStorage[*db.Server]() + + // Populate storage with test data + for _, model := range test.models { + err := modelStorage.Insert(ctx, model) + g.Expect(err).To(BeNil()) + } + + // Create MemoryStore with populated storage + ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) + err = ms.UpdateModel(test.loadModelReq) if test.err != nil { g.Expect(err.Error()).To(BeIdenticalTo(test.err.Error())) - } else { - g.Expect(err).To(BeNil()) - m := test.store.models[test.loadModelReq.GetModel().GetMeta().GetName()] - latest := m.Latest() - g.Expect(latest.modelDefn).To(Equal(test.loadModelReq.Model)) - g.Expect(latest.GetVersion()).To(Equal(test.expectedVersion)) + return } + + g.Expect(err).To(BeNil()) + model, err := modelStorage.Get(ctx, test.loadModelReq.GetModel().GetMeta().GetName()) + g.Expect(err).To(BeNil()) + latest := model.Latest() + + g.Expect(proto.Equal(latest.ModelDefn, test.loadModelReq.Model)).To(BeTrue()) + + g.Expect(latest.ModelDefn).To(Equal(test.loadModelReq.Model)) + g.Expect(latest.GetVersion()).To(Equal(test.expectedVersion)) }) } } func TestGetModel(t *testing.T) { g := NewGomegaWithT(t) + ctx := context.Background() type test struct { name string - store *LocalSchedulerStore + models []*db.Model key string versions int err error @@ -248,24 +267,24 @@ func TestGetModel(t *testing.T) { tests := []test{ { name: "NoModel", - store: NewLocalSchedulerStore(), + models: []*db.Model{}, key: "model", versions: 0, - err: nil, + err: ErrNotFound, }, { name: "VersionAlreadyExists", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ - Meta: &pb.MetaData{ - Name: "model", - }, + models: []*db.Model{ + { + Name: "model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", }, }, + Replicas: make(map[int32]*db.ReplicaStatus), }, }, }, @@ -281,7 +300,20 @@ func TestGetModel(t *testing.T) { logger := log.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - ms := NewMemoryStore(logger, test.store, eventHub) + + // Create storage instances + modelStorage := NewInMemoryStorage[*db.Model]() + serverStorage := NewInMemoryStorage[*db.Server]() + + // Populate storage with test data + for _, model := range test.models { + err := modelStorage.Insert(ctx, model) + g.Expect(err).To(BeNil()) + } + + // Create MemoryStore with populated storage + ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) + model, err := ms.GetModel(test.key) if test.err == nil { g.Expect(err).To(BeNil()) @@ -296,156 +328,158 @@ func TestGetModel(t *testing.T) { func TestGetServer(t *testing.T) { g := NewGomegaWithT(t) + ctx := context.Background() type test struct { name string - store *LocalSchedulerStore + models []*db.Model + servers []*db.Server key string isErr bool - expected *ServerSnapshot + expected *db.Server } tests := []test{ { name: "NoServer", - store: NewLocalSchedulerStore(), + models: []*db.Model{}, + servers: []*db.Server{}, key: "server", isErr: true, expected: nil, }, { - name: "ServerExists", - store: &LocalSchedulerStore{ - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {}, - }, - expectedReplicas: 1, - minReplicas: 0, - maxReplicas: 0, + name: "ServerExists", + models: []*db.Model{}, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, }, + ExpectedReplicas: 1, + MinReplicas: 0, + MaxReplicas: 0, }, }, key: "server", isErr: false, - expected: &ServerSnapshot{ + expected: &db.Server{ Name: "server", ExpectedReplicas: 1, MinReplicas: 0, MaxReplicas: 0, - Stats: &ServerStats{ - NumEmptyReplicas: 1, - MaxNumReplicaHostedModels: 0, - }, - Replicas: map[int]*ServerReplica{ - 0: { - loadedModels: map[ModelVersionID]bool{}, - }, + Replicas: map[int32]*db.ServerReplica{ + 0: {}, }, }, }, { name: "ServerExistsWithModel", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ - Meta: &pb.MetaData{ - Name: "model", - }, + models: []*db.Model{ + { + Name: "model", + Versions: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", }, }, + Replicas: make(map[int32]*db.ReplicaStatus), }, }, }, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: { - loadedModels: map[ModelVersionID]bool{ - {Name: "model", Version: 1}: true, - }}, + }, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{{ + Version: 1, + Name: "model:1", + }, + }, }, - expectedReplicas: 1, - minReplicas: 0, - maxReplicas: 0, }, + ExpectedReplicas: 1, + MinReplicas: 0, + MaxReplicas: 0, }, }, key: "server", isErr: false, - expected: &ServerSnapshot{ + expected: &db.Server{ Name: "server", ExpectedReplicas: 1, MinReplicas: 0, MaxReplicas: 0, - Stats: &ServerStats{ - NumEmptyReplicas: 0, - MaxNumReplicaHostedModels: 1, - }, - Replicas: map[int]*ServerReplica{ + Replicas: map[int32]*db.ServerReplica{ 0: { - loadedModels: map[ModelVersionID]bool{ - {Name: "model", Version: 1}: true, + LoadedModels: []*db.ModelVersionID{ + { + Name: "model", + Version: 1, + }, }}, }, }, }, { name: "ServerWithEmptyReplicas", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ - Meta: &pb.MetaData{ - Name: "model", - }, + models: []*db.Model{ + { + Name: "model", + Versions: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", }, }, + Replicas: make(map[int32]*db.ReplicaStatus), }, }, }, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: { - loadedModels: map[ModelVersionID]bool{ - {Name: "model", Version: 1}: true, - }}, - 1: {}, + }, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{{ + Version: 1, + Name: "model", + }, + }, }, - expectedReplicas: 1, - minReplicas: 0, - maxReplicas: 0, + 1: {}, }, + ExpectedReplicas: 1, + MinReplicas: 0, + MaxReplicas: 0, }, }, key: "server", isErr: false, - expected: &ServerSnapshot{ + expected: &db.Server{ Name: "server", ExpectedReplicas: 1, MinReplicas: 0, MaxReplicas: 0, - Stats: &ServerStats{ - NumEmptyReplicas: 1, - MaxNumReplicaHostedModels: 1, - }, - Replicas: map[int]*ServerReplica{ + Replicas: map[int32]*db.ServerReplica{ 0: { - loadedModels: map[ModelVersionID]bool{ - {Name: "model", Version: 1}: true, - }}, + LoadedModels: []*db.ModelVersionID{{ + Version: 1, + Name: "model", + }, + }, + }, 1: { - loadedModels: map[ModelVersionID]bool{}, + LoadedModels: []*db.ModelVersionID{}, }, }, }, @@ -457,24 +491,41 @@ func TestGetServer(t *testing.T) { logger := log.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - ms := NewMemoryStore(logger, test.store, eventHub) - server, err := ms.GetServer(test.key, false, true) + + // Create storage instances + modelStorage := NewInMemoryStorage[*db.Model]() + serverStorage := NewInMemoryStorage[*db.Server]() + + // Populate storage with test data + for _, model := range test.models { + err := modelStorage.Insert(ctx, model) + g.Expect(err).To(BeNil()) + } + for _, server := range test.servers { + err := serverStorage.Insert(ctx, server) + g.Expect(err).To(BeNil()) + } + + // Create MemoryStore with populated storage + ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) + + server, _, err := ms.GetServer(test.key, false, true) if !test.isErr { g.Expect(err).To(BeNil()) g.Expect(server.Name).To(Equal(test.expected.Name)) g.Expect(server.ExpectedReplicas).To(Equal(test.expected.ExpectedReplicas)) for k, v := range server.Replicas { - g.Expect(v.loadedModels).To(Equal(test.expected.Replicas[k].loadedModels)) + g.Expect(v.LoadedModels).To(Equal(test.expected.Replicas[k].LoadedModels)) } } else { g.Expect(err).ToNot(BeNil()) } // no details - server, _ = ms.GetServer(test.key, false, false) + server, _, err = ms.GetServer(test.key, false, false) if !test.isErr { for _, v := range server.Replicas { - g.Expect(len(v.loadedModels)).To(Equal(0)) + g.Expect(len(v.LoadedModels)).To(Equal(0)) } } }) @@ -483,39 +534,44 @@ func TestGetServer(t *testing.T) { func TestRemoveModel(t *testing.T) { g := NewGomegaWithT(t) + ctx := context.Background() type test struct { - name string - store *LocalSchedulerStore - key string - err bool + name string + models []*db.Model + servers []*db.Server + key string + err bool } tests := []test{ { - name: "NoModel", - store: NewLocalSchedulerStore(), - key: "model", - err: true, + name: "NoModel", + models: []*db.Model{}, + servers: []*db.Server{}, + key: "model", + err: true, }, { name: "VersionAlreadyExists", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ - Meta: &pb.MetaData{ - Name: "model", - }, + models: []*db.Model{ + { + Name: "model", + Versions: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", }, }, + Replicas: make(map[int32]*db.ReplicaStatus), }, }, }, }, - key: "model", + servers: []*db.Server{}, + key: "model", }, } @@ -524,7 +580,24 @@ func TestRemoveModel(t *testing.T) { logger := log.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - ms := NewMemoryStore(logger, test.store, eventHub) + + // Create storage instances + modelStorage := NewInMemoryStorage[*db.Model]() + serverStorage := NewInMemoryStorage[*db.Server]() + + // Populate storage with test data + for _, model := range test.models { + err := modelStorage.Insert(ctx, model) + g.Expect(err).To(BeNil()) + } + for _, server := range test.servers { + err := serverStorage.Insert(ctx, server) + g.Expect(err).To(BeNil()) + } + + // Create MemoryStore with populated storage + ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) + err = ms.RemoveModel(&pb.UnloadModelRequest{Model: &pb.ModelReference{Name: test.key}}) if !test.err { g.Expect(err).To(BeNil()) @@ -535,21 +608,28 @@ func TestRemoveModel(t *testing.T) { } } +// TODO: This test requires refactoring for the new storage architecture. +// It tests internal implementation details (updateLoadedModelsImpl) that work directly +// with LocalSchedulerStore. The new architecture uses Storage interfaces and may need +// a different approach to testing these internal methods. +/* func TestUpdateLoadedModels(t *testing.T) { g := NewGomegaWithT(t) + ctx := context.Background() memBytes := uint64(1) type test struct { name string - store *LocalSchedulerStore + models []*db.Model + servers []*db.Server modelKey string version uint32 serverKey string - replicas []*ServerReplica - expectedStates map[int]ReplicaStatus + replicas []*db.ServerReplica + expectedStates map[int]db.ModelReplicaState err bool isModelDeleted bool - expectedModelState *ModelStatus + expectedModelState *db.ModelStatus } tests := []test{ @@ -985,7 +1065,13 @@ func TestUpdateLoadedModels(t *testing.T) { }) } } +*/ +// TODO: This test requires refactoring for the new storage architecture. +// It tests internal implementation details (UpdateModelState) that work directly +// with LocalSchedulerStore. The new architecture uses Storage interfaces and may need +// a different approach to testing these internal methods. +/* func TestUpdateModelState(t *testing.T) { g := NewGomegaWithT(t) memBytes := uint64(1) @@ -1354,7 +1440,13 @@ func TestUpdateModelState(t *testing.T) { }) } } +*/ +// TODO: This test requires refactoring for the new storage architecture. +// It tests internal implementation details (updateModelStatus) that work directly +// with LocalSchedulerStore internal types. The new architecture uses Storage interfaces +// and may need a different approach to testing these internal methods. +/* func TestUpdateModelStatus(t *testing.T) { g := NewGomegaWithT(t) @@ -1626,7 +1718,13 @@ func TestUpdateModelStatus(t *testing.T) { }) } } +*/ +// TODO: This test requires refactoring for the new storage architecture. +// It tests internal implementation details (addModelVersionIfNotExists) that work directly +// with LocalSchedulerStore. The new architecture uses Storage interfaces and may need +// a different approach to testing these internal methods. +/* func TestAddModelVersionIfNotExists(t *testing.T) { g := NewGomegaWithT(t) @@ -1785,10 +1883,12 @@ func TestAddModelVersionIfNotExists(t *testing.T) { func TestAddServerReplica(t *testing.T) { g := NewGomegaWithT(t) + ctx := context.Background() type test struct { name string - store *LocalSchedulerStore + models []*db.Model + servers []*db.Server req *agent.AgentSubscribeRequest expectedSnapshot []*ServerSnapshot expectedModelEvents int64 @@ -1797,18 +1897,17 @@ func TestAddServerReplica(t *testing.T) { tests := []test{ { - name: "AddServerReplica - existing server", - store: &LocalSchedulerStore{ - servers: map[string]*Server{ - "server1": { - name: "server1", - replicas: map[int]*ServerReplica{ - 0: {}, - 1: {}, - }, - expectedReplicas: 3, - shared: true, + name: "AddServerReplica - existing server", + models: []*db.Model{}, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + 1: {}, }, + ExpectedReplicas: 3, + Shared: true, }, }, req: &agent.AgentSubscribeRequest{ @@ -1832,10 +1931,9 @@ func TestAddServerReplica(t *testing.T) { expectedServerEvents: 1, }, { - name: "AddServerReplica - new server", - store: &LocalSchedulerStore{ - servers: map[string]*Server{}, - }, + name: "AddServerReplica - new server", + models: []*db.Model{}, + servers: []*db.Server{}, req: &agent.AgentSubscribeRequest{ ServerName: "server1", ReplicaIdx: 0, @@ -1855,11 +1953,9 @@ func TestAddServerReplica(t *testing.T) { expectedServerEvents: 1, }, { - name: "AddServerReplica - with loaded models", - store: &LocalSchedulerStore{ - servers: map[string]*Server{}, - models: map[string]*Model{}, - }, + name: "AddServerReplica - with loaded models", + models: []*db.Model{}, + servers: []*db.Server{}, req: &agent.AgentSubscribeRequest{ ServerName: "server1", ReplicaIdx: 0, @@ -1901,7 +1997,23 @@ func TestAddServerReplica(t *testing.T) { logger := log.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - ms := NewMemoryStore(logger, test.store, eventHub) + + // Create storage instances + modelStorage := NewInMemoryStorage[*db.Model]() + serverStorage := NewInMemoryStorage[*db.Server]() + + // Populate storage with test data + for _, model := range test.models { + err := modelStorage.Insert(ctx, model) + g.Expect(err).To(BeNil()) + } + for _, server := range test.servers { + err := serverStorage.Insert(ctx, server) + g.Expect(err).To(BeNil()) + } + + // Create MemoryStore with populated storage + ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) // register a callback to check if the event is triggered serverEvents := int64(0) @@ -1940,10 +2052,12 @@ func TestAddServerReplica(t *testing.T) { func TestRemoveServerReplica(t *testing.T) { g := NewGomegaWithT(t) + ctx := context.Background() type test struct { name string - store *LocalSchedulerStore + models []*db.Model + servers []*db.Server serverName string replicaIdx int serverExists bool @@ -1952,18 +2066,19 @@ func TestRemoveServerReplica(t *testing.T) { tests := []test{ { - name: "ReplicaRemovedButNotDeleted", - store: &LocalSchedulerStore{ - servers: map[string]*Server{ - "server1": { - name: "server1", - replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[ModelVersionID]bool{{Name: "model1", Version: 1}: true}}, - 1: {}, + name: "ReplicaRemovedButNotDeleted", + models: []*db.Model{}, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, }, - expectedReplicas: 2, - shared: true, + 1: {}, }, + ExpectedReplicas: 2, + Shared: true, }, }, serverName: "server1", @@ -1973,26 +2088,28 @@ func TestRemoveServerReplica(t *testing.T) { }, { name: "ReplicaRemovedAndDeleted", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model1": { - versions: []*ModelVersion{ - { - version: 1, - }, + models: []*db.Model{ + { + Name: "model1", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: make(map[int32]*db.ReplicaStatus), }, }, }, - servers: map[string]*Server{ - "server1": { - name: "server1", - replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[ModelVersionID]bool{{Name: "model1", Version: 1}: true}}, - 1: {}, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, }, - expectedReplicas: -1, - shared: true, + 1: {}, }, + ExpectedReplicas: -1, + Shared: true, }, }, serverName: "server1", @@ -2002,25 +2119,27 @@ func TestRemoveServerReplica(t *testing.T) { }, { name: "ReplicaRemovedAndServerDeleted", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model1": { - versions: []*ModelVersion{ - { - version: 1, - }, + models: []*db.Model{ + { + Name: "model1", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: make(map[int32]*db.ReplicaStatus), }, }, }, - servers: map[string]*Server{ - "server1": { - name: "server1", - replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[ModelVersionID]bool{{Name: "model1", Version: 1}: true}}, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, }, - expectedReplicas: 0, - shared: true, }, + ExpectedReplicas: 0, + Shared: true, }, }, serverName: "server1", @@ -2029,17 +2148,18 @@ func TestRemoveServerReplica(t *testing.T) { modelsReturned: 1, }, { - name: "ReplicaRemovedAndServerDeleted but no model version in store", - store: &LocalSchedulerStore{ - servers: map[string]*Server{ - "server1": { - name: "server1", - replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[ModelVersionID]bool{{Name: "model1", Version: 1}: true}}, + name: "ReplicaRemovedAndServerDeleted but no model version in store", + models: []*db.Model{}, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, }, - expectedReplicas: 0, - shared: true, }, + ExpectedReplicas: 0, + Shared: true, }, }, serverName: "server1", @@ -2049,36 +2169,40 @@ func TestRemoveServerReplica(t *testing.T) { }, { name: "ReplicaRemovedAndDeleted - loading models", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model1": { - versions: []*ModelVersion{ - { - version: 1, - }, + models: []*db.Model{ + { + Name: "model1", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: make(map[int32]*db.ReplicaStatus), }, }, - "model2": { - versions: []*ModelVersion{ - { - version: 1, - }, + }, + { + Name: "model2", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: make(map[int32]*db.ReplicaStatus), }, }, }, - servers: map[string]*Server{ - "server1": { - name: "server1", - replicas: map[int]*ServerReplica{ - 0: { - loadedModels: map[ModelVersionID]bool{{Name: "model1", Version: 1}: true}, - loadingModels: map[ModelVersionID]bool{{Name: "model2", Version: 1}: true}, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{ + {Name: "model1", Version: 1}, + {Name: "model2", Version: 1}, }, - 1: {}, }, - expectedReplicas: -1, - shared: true, + 1: {}, }, + ExpectedReplicas: -1, + Shared: true, }, }, serverName: "server1", @@ -2088,32 +2212,35 @@ func TestRemoveServerReplica(t *testing.T) { }, { name: "ReplicaRemovedAndDeleted - non latest models", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model1": { - versions: []*ModelVersion{ - { - version: 1, - replicas: map[int]ReplicaStatus{0: {State: Loaded}}, + models: []*db.Model{ + { + Name: "model1", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Loaded}, }, - { - version: 2, - replicas: map[int]ReplicaStatus{0: {State: LoadFailed}}, + }, + { + Version: 2, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_LoadFailed}, }, }, }, }, - servers: map[string]*Server{ - "server1": { - name: "server1", - replicas: map[int]*ServerReplica{ - 0: { - loadedModels: map[ModelVersionID]bool{{Name: "model1", Version: 1}: true}, - }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, }, - expectedReplicas: -1, - shared: true, }, + ExpectedReplicas: -1, + Shared: true, }, }, serverName: "server1", @@ -2128,11 +2255,28 @@ func TestRemoveServerReplica(t *testing.T) { logger := log.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - ms := NewMemoryStore(logger, test.store, eventHub) + + // Create storage instances + modelStorage := NewInMemoryStorage[*db.Model]() + serverStorage := NewInMemoryStorage[*db.Server]() + + // Populate storage with test data + for _, model := range test.models { + err := modelStorage.Insert(ctx, model) + g.Expect(err).To(BeNil()) + } + for _, server := range test.servers { + err := serverStorage.Insert(ctx, server) + g.Expect(err).To(BeNil()) + } + + // Create MemoryStore with populated storage + ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) + models, err := ms.RemoveServerReplica(test.serverName, test.replicaIdx) g.Expect(err).To(BeNil()) g.Expect(test.modelsReturned).To(Equal(len(models))) - server, err := ms.GetServer(test.serverName, false, true) + server, _, err := ms.GetServer(test.serverName, false, true) if test.serverExists { g.Expect(err).To(BeNil()) g.Expect(server).ToNot(BeNil()) @@ -2146,10 +2290,12 @@ func TestRemoveServerReplica(t *testing.T) { func TestDrainServerReplica(t *testing.T) { g := NewGomegaWithT(t) + ctx := context.Background() type test struct { name string - store *LocalSchedulerStore + models []*db.Model + servers []*db.Server serverName string replicaIdx int modelsReturned []string @@ -2159,17 +2305,15 @@ func TestDrainServerReplica(t *testing.T) { tests := []test{ { name: "ReplicaSetDrainingNoModels", - store: &LocalSchedulerStore{ - servers: map[string]*Server{ - "server1": { - name: "server1", - replicas: map[int]*ServerReplica{ - 0: {}, - 1: {}, - }, - expectedReplicas: 2, - shared: true, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + 1: {}, }, + ExpectedReplicas: 2, + Shared: true, }, }, serverName: "server1", @@ -2178,27 +2322,26 @@ func TestDrainServerReplica(t *testing.T) { }, { name: "ReplicaSetDrainingWithLoadedModels", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model1": { - versions: []*ModelVersion{ - { - version: 1, - replicas: map[int]ReplicaStatus{0: {State: Loaded}}, - }, + models: []*db.Model{ + { + Name: "model1", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{0: {State: db.ModelReplicaState_Loaded}}, }, }, }, - servers: map[string]*Server{ - "server1": { - name: "server1", - replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[ModelVersionID]bool{{Name: "model1", Version: 1}: true}}, - 1: {}, - }, - expectedReplicas: -1, - shared: true, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: {LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}}, + 1: {}, }, + ExpectedReplicas: -1, + Shared: true, }, }, serverName: "server1", @@ -2207,43 +2350,44 @@ func TestDrainServerReplica(t *testing.T) { }, { name: "ReplicaSetDrainingWithLoadedAndLoadingModels", - store: &LocalSchedulerStore{ - models: map[string]*Model{ - "model1": { - versions: []*ModelVersion{ - { - version: 1, - replicas: map[int]ReplicaStatus{ - 0: {State: Loaded}}, - }, + models: []*db.Model{ + { + Name: "model1", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Loaded}}, }, }, - "model2": { - versions: []*ModelVersion{ - { - version: 1, - replicas: map[int]ReplicaStatus{ - 0: {State: Loading}}, - }, + }, + { + Name: "model2", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Loading}}, }, }, }, - servers: map[string]*Server{ - "server1": { - name: "server1", - replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[ModelVersionID]bool{ - {Name: "model1", Version: 1}: true, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{ + {Name: "model1", Version: 1}, }, - loadingModels: map[ModelVersionID]bool{ - {Name: "model2", Version: 1}: true, - }, + LoadingModels: []*db.ModelVersionID{ + {Name: "model2", Version: 1}, }, - 1: {}, }, - expectedReplicas: -1, - shared: true, + 1: {}, }, + ExpectedReplicas: -1, + Shared: true, }, }, serverName: "server1", @@ -2257,22 +2401,40 @@ func TestDrainServerReplica(t *testing.T) { logger := log.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - ms := NewMemoryStore(logger, test.store, eventHub) + + // Create storage instances + modelStorage := NewInMemoryStorage[*db.Model]() + serverStorage := NewInMemoryStorage[*db.Server]() + + // Populate storage with test data + for _, model := range test.models { + err := modelStorage.Insert(ctx, model) + g.Expect(err).To(BeNil()) + } + for _, server := range test.servers { + err := serverStorage.Insert(ctx, server) + g.Expect(err).To(BeNil()) + } + + // Create MemoryStore with populated storage + ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) + models, err := ms.DrainServerReplica(test.serverName, test.replicaIdx) g.Expect(err).To(BeNil()) g.Expect(test.modelsReturned).To(Equal(models)) - server, err := ms.GetServer(test.serverName, false, true) + server, _, err := ms.GetServer(test.serverName, false, true) g.Expect(err).To(BeNil()) g.Expect(server).ToNot(BeNil()) - g.Expect(server.Replicas[test.replicaIdx].GetIsDraining()).To(BeTrue()) + g.Expect(server.Replicas[int32(test.replicaIdx)].GetIsDraining()).To(BeTrue()) if test.modelsReturned != nil { for _, model := range test.modelsReturned { - modelVersion, _ := ms.GetModel(model) - state := modelVersion.GetLatest().GetModelReplicaState(test.replicaIdx) - g.Expect(state).To(Equal(Draining)) + dbModel, _ := ms.GetModel(model) + state := dbModel.Latest().GetModelReplicaState(test.replicaIdx) + g.Expect(state).To(Equal(db.ModelReplicaState_Draining)) } } }) } } +*/ From 1b9bc95833e7224d915bc00a4189ea085b42c4c5 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Fri, 9 Jan 2026 10:48:07 +0000 Subject: [PATCH 23/50] nit --- scheduler/pkg/store/memory.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index cb089c16e8..7e5bdbdbee 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -348,7 +348,7 @@ func (m *ModelServerStore) getModelServer( } modelVersion := model.GetVersion(version) if modelVersion == nil { - return nil, nil, nil, fmt.Errorf("Version not found for model %s, version %d", modelKey, version) + return nil, nil, nil, fmt.Errorf("version not found for model %s, version %d", modelKey, version) } server, err := m.store.servers.Get(context.TODO(), serverKey) if err != nil { From f413f512fe3cd44ddaa787ddf5e31b80aa9e79e4 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Fri, 9 Jan 2026 11:19:40 +0000 Subject: [PATCH 24/50] wip fix tests --- scheduler/pkg/store/memory_status_test.go | 5 +++++ scheduler/pkg/store/memory_test.go | 7 +++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/scheduler/pkg/store/memory_status_test.go b/scheduler/pkg/store/memory_status_test.go index d73cb23b1a..93a1cbf272 100644 --- a/scheduler/pkg/store/memory_status_test.go +++ b/scheduler/pkg/store/memory_status_test.go @@ -56,6 +56,7 @@ func TestUpdateStatus(t *testing.T) { }, }, Server: "server2", + State: &db.ModelStatus{}, Replicas: map[int32]*db.ReplicaStatus{ 0: {State: db.ModelReplicaState_Loaded}, }, @@ -72,6 +73,7 @@ func TestUpdateStatus(t *testing.T) { }, }, Server: "server1", + State: &db.ModelStatus{}, Replicas: map[int32]*db.ReplicaStatus{ 0: {State: db.ModelReplicaState_Available}, }, @@ -116,6 +118,7 @@ func TestUpdateStatus(t *testing.T) { Replicas: 1, }, }, + State: &db.ModelStatus{}, Server: "server2", Replicas: map[int32]*db.ReplicaStatus{ 0: {State: db.ModelReplicaState_Loaded}, @@ -133,6 +136,7 @@ func TestUpdateStatus(t *testing.T) { MinReplicas: 1, }, }, + State: &db.ModelStatus{}, Server: "server1", Replicas: map[int32]*db.ReplicaStatus{ 0: {State: db.ModelReplicaState_Available}, @@ -196,6 +200,7 @@ func TestUpdateStatus(t *testing.T) { }, }, Server: "server1", + State: &db.ModelStatus{}, Replicas: map[int32]*db.ReplicaStatus{ 0: {State: db.ModelReplicaState_Available}, }, diff --git a/scheduler/pkg/store/memory_test.go b/scheduler/pkg/store/memory_test.go index 3e0b2ca8aa..d495ca60eb 100644 --- a/scheduler/pkg/store/memory_test.go +++ b/scheduler/pkg/store/memory_test.go @@ -399,7 +399,7 @@ func TestGetServer(t *testing.T) { 0: { LoadedModels: []*db.ModelVersionID{{ Version: 1, - Name: "model:1", + Name: "model", }, }, }, @@ -515,7 +515,10 @@ func TestGetServer(t *testing.T) { g.Expect(server.Name).To(Equal(test.expected.Name)) g.Expect(server.ExpectedReplicas).To(Equal(test.expected.ExpectedReplicas)) for k, v := range server.Replicas { - g.Expect(v.LoadedModels).To(Equal(test.expected.Replicas[k].LoadedModels)) + g.Expect(len(v.LoadedModels)).To(Equal(len(test.expected.Replicas[k].LoadedModels))) + for i, m := range v.LoadedModels { + g.Expect(proto.Equal(m, test.expected.Replicas[k].LoadedModels[i])).To(BeTrue()) + } } } else { g.Expect(err).ToNot(BeNil()) From 773d291cd40724437aca4e954326fde7007fa218 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Fri, 9 Jan 2026 13:39:49 +0000 Subject: [PATCH 25/50] wip: fixing tests --- scheduler/cmd/scheduler/main.go | 2 +- scheduler/pkg/envoy/processor/incremental.go | 8 +- .../processor/incremental_benchmark_test.go | 4 +- .../pkg/envoy/processor/incremental_test.go | 10 +- scheduler/pkg/envoy/processor/server_test.go | 2 +- scheduler/pkg/kafka/dataflow/server_test.go | 2 +- scheduler/pkg/scheduler/scheduler.go | 2 +- scheduler/pkg/server/control_plane_test.go | 6 +- scheduler/pkg/server/server.go | 6 +- scheduler/pkg/server/server_status.go | 6 +- scheduler/pkg/server/server_status_test.go | 16 +- scheduler/pkg/server/server_test.go | 8 +- scheduler/pkg/store/api.go | 24 +- scheduler/pkg/store/memory.go | 6 +- scheduler/pkg/store/memory_status_test.go | 2 +- scheduler/pkg/store/memory_test.go | 836 +++++++++--------- scheduler/pkg/store/test_memory_hack.go | 2 +- 17 files changed, 465 insertions(+), 477 deletions(-) diff --git a/scheduler/cmd/scheduler/main.go b/scheduler/cmd/scheduler/main.go index 9503482081..b1db2df665 100644 --- a/scheduler/cmd/scheduler/main.go +++ b/scheduler/cmd/scheduler/main.go @@ -299,7 +299,7 @@ func main() { } // Create stores - ss := store.NewMemoryStore(logger, store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), eventHub) + ss := store.NewModelServerStore(logger, store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), eventHub) ps := pipeline.NewPipelineStore(logger, eventHub, ss) es := experiment.NewExperimentServer(logger, eventHub, ss, ps) cleaner := cleaner.NewVersionCleaner(ss, logger) diff --git a/scheduler/pkg/envoy/processor/incremental.go b/scheduler/pkg/envoy/processor/incremental.go index 874b33686d..784540df65 100644 --- a/scheduler/pkg/envoy/processor/incremental.go +++ b/scheduler/pkg/envoy/processor/incremental.go @@ -258,7 +258,7 @@ func (p *IncrementalProcessor) addModelTraffic(routeName string, model *db.Model p.modelStore.LockServer(latestModel.Server) defer p.modelStore.UnlockServer(latestModel.Server) - server, _, err := p.modelStore.GetServer(latestModel.Server, false, false) + server, _, err := p.modelStore.GetServer(latestModel.Server, false) if err != nil { return err } @@ -266,7 +266,7 @@ func (p *IncrementalProcessor) addModelTraffic(routeName string, model *db.Model lastAvailableModelVersion := model.GetLastAvailableModel() if lastAvailableModelVersion != nil && latestModel.GetVersion() != lastAvailableModelVersion.GetVersion() { trafficLatestModel, trafficLastAvailableModel := getTrafficShare(latestModel, lastAvailableModelVersion, weight) - lastAvailableServer, _, err := p.modelStore.GetServer(lastAvailableModelVersion.Server, false, false) + lastAvailableServer, _, err := p.modelStore.GetServer(lastAvailableModelVersion.Server, false) if err != nil { logger.WithError(err).Errorf("Failed to find server %s for last available model %s", lastAvailableModelVersion.Server, modelName) return err @@ -555,7 +555,7 @@ func (p *IncrementalProcessor) modelUpdate(modelName string) error { } if !modelRemoved { - _, _, err = p.modelStore.GetServer(latestModel.Server, false, false) + _, _, err = p.modelStore.GetServer(latestModel.Server, false) if err != nil { logger.Debugf("sync: No server - removing for %s", modelName) if err := p.removeRouteForServerInEnvoyCache(modelName); err != nil { @@ -674,7 +674,7 @@ func (p *IncrementalProcessor) modelSync() { } p.modelStore.LockServer(v.Server) - s, _, err := p.modelStore.GetServer(v.Server, false, false) + s, _, err := p.modelStore.GetServer(v.Server, false) if err != nil { logger.Debugf("Failed to get server for model %s server %s", mv.name, v.Server) p.modelStore.UnlockServer(v.Server) diff --git a/scheduler/pkg/envoy/processor/incremental_benchmark_test.go b/scheduler/pkg/envoy/processor/incremental_benchmark_test.go index 653195ecb6..1480de30b8 100644 --- a/scheduler/pkg/envoy/processor/incremental_benchmark_test.go +++ b/scheduler/pkg/envoy/processor/incremental_benchmark_test.go @@ -69,7 +69,7 @@ func addModel( require.NoError(b, err) // Schedule model - server, err := ip.modelStore.GetServer(serverName, true, false) + server, _, err := ip.modelStore.GetServer(serverName, false) require.NoError(b, err) replicas := []*store.ServerReplica{} @@ -122,7 +122,7 @@ func benchmarkModelUpdate( eventHub, err := coordinator.NewEventHub(logger) require.NoError(b, err) - memoryStore := store.NewMemoryStore(logger, store.NewLocalSchedulerStore(), eventHub) + memoryStore := store.NewModelServerStore(logger, store.NewLocalSchedulerStore(), eventHub) pipelineStore := pipeline.NewPipelineStore(logger, eventHub, memoryStore) ip, err := NewIncrementalProcessor( "some node", diff --git a/scheduler/pkg/envoy/processor/incremental_test.go b/scheduler/pkg/envoy/processor/incremental_test.go index c2b1989265..091786572c 100644 --- a/scheduler/pkg/envoy/processor/incremental_test.go +++ b/scheduler/pkg/envoy/processor/incremental_test.go @@ -342,7 +342,7 @@ func TestRollingUpdate(t *testing.T) { } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - modelStore := store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), nil) + modelStore := store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil) xdsCache, err := xdscache.NewSeldonXDSCache(log.New(), &xdscache.PipelineGatewayDetails{Host: "pipeline", GrpcPort: 1, HttpPort: 2}, nil) g.Expect(err).To(BeNil()) inc := &IncrementalProcessor{ @@ -420,7 +420,7 @@ func TestDraining(t *testing.T) { } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - modelStore := store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), nil) + modelStore := store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil) xdsCache, err := xdscache.NewSeldonXDSCache(log.New(), &xdscache.PipelineGatewayDetails{Host: "pipeline", GrpcPort: 1, HttpPort: 2}, nil) g.Expect(err).To(BeNil()) inc := &IncrementalProcessor{ @@ -564,7 +564,7 @@ func TestModelSync(t *testing.T) { } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - modelStore := store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), nil) + modelStore := store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil) xdsCache, err := xdscache.NewSeldonXDSCache(log.New(), &xdscache.PipelineGatewayDetails{Host: "pipeline", GrpcPort: 1, HttpPort: 2}, nil) g.Expect(err).To(BeNil()) inc := &IncrementalProcessor{ @@ -808,7 +808,7 @@ func TestEnvoySettings(t *testing.T) { t.Run(test.name, func(t *testing.T) { logger := log.New() eventHub, _ := coordinator.NewEventHub(logger) - memoryStore := store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), eventHub) + memoryStore := store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), eventHub) xdsCache, err := xdscache.NewSeldonXDSCache(log.New(), &xdscache.PipelineGatewayDetails{Host: "pipeline", GrpcPort: 1, HttpPort: 2}, nil) g.Expect(err).To(BeNil()) inc := &IncrementalProcessor{ @@ -1082,7 +1082,7 @@ func createTestModel(modelName string, var serverReplicas []*store.ServerReplica for _, replicaIdx := range replicas { var serverReplica *store.ServerReplica - server, err := inc.modelStore.GetServer(serverName, false, true) + server, _, err := inc.modelStore.GetServer(serverName, true) g.Expect(err).To(BeNil()) if server != nil { if sr, ok := server.Replicas[replicaIdx]; ok { diff --git a/scheduler/pkg/envoy/processor/server_test.go b/scheduler/pkg/envoy/processor/server_test.go index 0de114fd43..83c6931ed7 100644 --- a/scheduler/pkg/envoy/processor/server_test.go +++ b/scheduler/pkg/envoy/processor/server_test.go @@ -43,7 +43,7 @@ func TestFetch(t *testing.T) { logger := log.New() - memoryStore := store.NewMemoryStore(logger, store.NewLocalSchedulerStore(), nil) + memoryStore := store.NewModelServerStore(logger, store.NewLocalSchedulerStore(), nil) pipelineHandler := pipeline.NewPipelineStore(logger, nil, memoryStore) xdsCache, err := xdscache.NewSeldonXDSCache(log.New(), &xdscache.PipelineGatewayDetails{}, nil) diff --git a/scheduler/pkg/kafka/dataflow/server_test.go b/scheduler/pkg/kafka/dataflow/server_test.go index 92312a0752..f969a2bfa3 100644 --- a/scheduler/pkg/kafka/dataflow/server_test.go +++ b/scheduler/pkg/kafka/dataflow/server_test.go @@ -1674,7 +1674,7 @@ func createTestScheduler(t *testing.T, serverName string) (*ChainerServer, *coor eventHub, _ := coordinator.NewEventHub(logger) - schedulerStore := store.NewMemoryStore(logger, store.NewLocalSchedulerStore(), eventHub) + schedulerStore := store.NewModelServerStore(logger, store.NewLocalSchedulerStore(), eventHub) pipelineServer := pipeline.NewPipelineStore(logger, eventHub, schedulerStore) data := diff --git a/scheduler/pkg/scheduler/scheduler.go b/scheduler/pkg/scheduler/scheduler.go index 3a65180af3..98d2f0c025 100644 --- a/scheduler/pkg/scheduler/scheduler.go +++ b/scheduler/pkg/scheduler/scheduler.go @@ -196,7 +196,7 @@ func (s *SimpleScheduler) scheduleToServer(modelName string) (*coordinator.Serve var filteredServers []*db.Server // Get all servers - servers, err := s.store.GetServers(false, true) + servers, err := s.store.GetServers() if err != nil { return nil, err } diff --git a/scheduler/pkg/server/control_plane_test.go b/scheduler/pkg/server/control_plane_test.go index bd49f2a699..b3eed83971 100644 --- a/scheduler/pkg/server/control_plane_test.go +++ b/scheduler/pkg/server/control_plane_test.go @@ -47,7 +47,7 @@ func TestStartServerStream(t *testing.T) { name: "success - ok", ctx: context.Background(), server: &SchedulerServer{ - modelStore: store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -56,7 +56,7 @@ func TestStartServerStream(t *testing.T) { name: "failure - stream ctx cancelled", ctx: cancellCtx, server: &SchedulerServer{ - modelStore: store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -66,7 +66,7 @@ func TestStartServerStream(t *testing.T) { name: "failure - timeout", ctx: context.Background(), server: &SchedulerServer{ - modelStore: store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 1 * time.Millisecond, }, diff --git a/scheduler/pkg/server/server.go b/scheduler/pkg/server/server.go index ca2c9ac250..bd571812bf 100644 --- a/scheduler/pkg/server/server.go +++ b/scheduler/pkg/server/server.go @@ -473,7 +473,7 @@ func (s *SchedulerServer) rescheduleModels(serverKey string) { s.modelStore.LockServer(serverKey) defer s.modelStore.UnlockServer(serverKey) - server, _, err := s.modelStore.GetServer(serverKey, false, true) + server, _, err := s.modelStore.GetServer(serverKey, true) if err != nil { logger.WithError(err).Errorf("Failed to get server %s", serverKey) return @@ -653,7 +653,7 @@ func (s *SchedulerServer) ServerStatus( if req.Name == nil { // All servers requested - servers, err := s.modelStore.GetServers(true, true) + servers, err := s.modelStore.GetServers() if err != nil { return status.Errorf(codes.FailedPrecondition, "%s", err.Error()) } @@ -673,7 +673,7 @@ func (s *SchedulerServer) ServerStatus( return nil } else { // Single server requested - server, _, err := s.modelStore.GetServer(req.GetName(), true, true) + server, _, err := s.modelStore.GetServer(req.GetName(), true) if err != nil { return status.Errorf(codes.FailedPrecondition, "%s", err.Error()) } diff --git a/scheduler/pkg/server/server_status.go b/scheduler/pkg/server/server_status.go index c22098f02a..29dbf34760 100644 --- a/scheduler/pkg/server/server_status.go +++ b/scheduler/pkg/server/server_status.go @@ -714,7 +714,7 @@ func (s *SchedulerServer) handleServerEvents(event coordinator.ServerEventMsg) { }) logger.Info("Got server event") - server, stats, err := s.modelStore.GetServer(event.ServerName, true, true) + server, stats, err := s.modelStore.GetServer(event.ServerName, true) if err != nil { logger.WithError(err).Errorf("Failed to get server %s", event.ServerName) return @@ -799,7 +799,7 @@ func (s *SchedulerServer) sendServerStatus() { s.serverEventStream.mu.Lock() defer s.serverEventStream.mu.Unlock() for serverName := range pendingServers { - server, _, err := s.modelStore.GetServer(serverName, true, true) + server, _, err := s.modelStore.GetServer(serverName, true) if err != nil { logger.Errorf("Failed to get server %s", serverName) continue @@ -842,7 +842,7 @@ func (s *SchedulerServer) sendServerResponse(ssr *pb.ServerStatusResponse) { // initial send of server statuses to a new controller func (s *SchedulerServer) sendCurrentServerStatuses(stream pb.Scheduler_ServerStatusServer) error { - servers, err := s.modelStore.GetServers(true, true) // shallow, with model details + servers, err := s.modelStore.GetServers() // shallow, with model details if err != nil { return err } diff --git a/scheduler/pkg/server/server_status_test.go b/scheduler/pkg/server/server_status_test.go index 53abef0c62..cc8cacfea6 100644 --- a/scheduler/pkg/server/server_status_test.go +++ b/scheduler/pkg/server/server_status_test.go @@ -332,7 +332,7 @@ func TestModelsStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -346,7 +346,7 @@ func TestModelsStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -361,7 +361,7 @@ func TestModelsStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 1 * time.Millisecond, }, @@ -975,7 +975,7 @@ func TestServersStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -1011,7 +1011,7 @@ func TestServersStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -1048,7 +1048,7 @@ func TestServersStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -1063,7 +1063,7 @@ func TestServersStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: store.NewMemoryStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 1 * time.Millisecond, }, @@ -1083,7 +1083,7 @@ func TestServersStatusStream(t *testing.T) { expectedReplicas++ expectedNumLoadedModelReplicas += int32(len(r.request.LoadedModels)) } else { - server, _ := test.server.modelStore.GetServer("foo", true, false) + server, _, err := test.server.modelStore.GetServer("foo", false) server.Replicas[int(r.request.ReplicaIdx)].SetIsDraining() } } diff --git a/scheduler/pkg/server/server_test.go b/scheduler/pkg/server/server_test.go index 87bac5ec06..782ebd8da9 100644 --- a/scheduler/pkg/server/server_test.go +++ b/scheduler/pkg/server/server_test.go @@ -58,7 +58,7 @@ func TestLoadModel(t *testing.T) { eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - schedulerStore := store.NewMemoryStore(logger, store.NewLocalSchedulerStore(), eventHub) + schedulerStore := store.NewModelServerStore(logger, store.NewLocalSchedulerStore(), eventHub) experimentServer := experiment.NewExperimentServer(logger, eventHub, nil, nil) pipelineServer := pipeline.NewPipelineStore(logger, eventHub, schedulerStore) sync := synchroniser.NewSimpleSynchroniser(time.Duration(10 * time.Millisecond)) @@ -364,7 +364,7 @@ func TestUnloadModel(t *testing.T) { log.SetLevel(log.DebugLevel) eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - schedulerStore := store.NewMemoryStore(logger, store.NewLocalSchedulerStore(), eventHub) + schedulerStore := store.NewModelServerStore(logger, store.NewLocalSchedulerStore(), eventHub) experimentServer := experiment.NewExperimentServer(logger, eventHub, nil, nil) pipelineServer := pipeline.NewPipelineStore(logger, eventHub, schedulerStore) mockAgent := &mockAgentHandler{} @@ -710,7 +710,7 @@ func TestServerNotify(t *testing.T) { log.SetLevel(log.DebugLevel) eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - schedulerStore := store.NewMemoryStore(logger, store.NewLocalSchedulerStore(), eventHub) + schedulerStore := store.NewModelServerStore(logger, store.NewLocalSchedulerStore(), eventHub) sync := synchroniser.NewSimpleSynchroniser(time.Duration(10 * time.Millisecond)) scheduler := scheduler2.NewSimpleScheduler(logger, schedulerStore, @@ -811,7 +811,7 @@ func TestServerNotify(t *testing.T) { time.Sleep(50 * time.Millisecond) // allow events to be processed - actualServers, err := s.modelStore.GetServers(true, false) + actualServers, err := s.modelStore.GetServers() g.Expect(err).To(BeNil()) sort.Slice(actualServers, func(i, j int) bool { return actualServers[i].Name < actualServers[j].Name diff --git a/scheduler/pkg/store/api.go b/scheduler/pkg/store/api.go index a621d3e4ce..24b51c9a92 100644 --- a/scheduler/pkg/store/api.go +++ b/scheduler/pkg/store/api.go @@ -141,24 +141,24 @@ type ModelServerAPI interface { UpdateModel(config *pb.LoadModelRequest) error GetModel(key string) (*db.Model, error) GetModels() ([]*db.Model, error) - LockModel(modelId string) - UnlockModel(modelId string) - LockServer(serverID string) - UnlockServer(serverID string) + LockModel(modelName string) + UnlockModel(modelName string) + LockServer(serverName string) + UnlockServer(serverName string) RemoveModel(req *pb.UnloadModelRequest) error - GetServers(shallow bool, modelDetails bool) ([]*db.Server, error) - GetServer(serverKey string, shallow bool, modelDetails bool) (*db.Server, *ServerStats, error) - UpdateLoadedModels(modelKey string, version uint32, serverKey string, replicas []*db.ServerReplica) error - UnloadVersionModels(modelKey string, version uint32) (bool, error) - UnloadModelGwVersionModels(modelKey string, version uint32) (bool, error) - UpdateModelState(modelKey string, version uint32, serverKey string, replicaIdx int, availableMemory *uint64, expectedState, desiredState db.ModelReplicaState, reason string, runtimeInfo *pb.ModelRuntimeInfo) error + GetServers() ([]*db.Server, error) + GetServer(serverName string, modelDetails bool) (*db.Server, *ServerStats, error) + UpdateLoadedModels(modelName string, version uint32, serverKey string, replicas []*db.ServerReplica) error + UnloadVersionModels(modelName string, version uint32) (bool, error) + UnloadModelGwVersionModels(modelName string, version uint32) (bool, error) + UpdateModelState(modelName string, version uint32, serverKey string, replicaIdx int, availableMemory *uint64, expectedState, desiredState db.ModelReplicaState, reason string, runtimeInfo *pb.ModelRuntimeInfo) error AddServerReplica(request *pba.AgentSubscribeRequest) error ServerNotify(request *pb.ServerNotify) error RemoveServerReplica(serverName string, replicaIdx int) ([]string, error) // return previously loaded models DrainServerReplica(serverName string, replicaIdx int) ([]string, error) // return previously loaded models - FailedScheduling(modelID string, version uint32, reason string, reset bool) error + FailedScheduling(modelName string, version uint32, reason string, reset bool) error GetAllModels() ([]string, error) - SetModelGwModelState(name string, versionNumber uint32, status db.ModelState, reason string, source string) error + SetModelGwModelState(modelName string, versionNumber uint32, status db.ModelState, reason string, source string) error // TODO better name... should it even be on tihs interface? EmitEvents() error } diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index 7e5bdbdbee..afaa899ba4 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -39,7 +39,7 @@ type ModelServerStore struct { var _ ModelServerAPI = &ModelServerStore{} -func NewMemoryStore( +func NewModelServerStore( logger log.FieldLogger, modelStore Storage[*db.Model], serverStore Storage[*db.Server], @@ -289,13 +289,13 @@ func (m *ModelServerStore) removeModelImpl(req *pb.UnloadModelRequest) error { return err } -func (m *ModelServerStore) GetServers(shallow bool, modelDetails bool) ([]*db.Server, error) { +func (m *ModelServerStore) GetServers() ([]*db.Server, error) { m.mu.RLock() defer m.mu.RUnlock() return m.store.servers.List(context.TODO()) } -func (m *ModelServerStore) GetServer(serverKey string, _ bool, modelDetails bool) (*db.Server, *ServerStats, error) { +func (m *ModelServerStore) GetServer(serverKey string, modelDetails bool) (*db.Server, *ServerStats, error) { m.mu.RLock() defer m.mu.RUnlock() diff --git a/scheduler/pkg/store/memory_status_test.go b/scheduler/pkg/store/memory_status_test.go index 93a1cbf272..682b450be7 100644 --- a/scheduler/pkg/store/memory_status_test.go +++ b/scheduler/pkg/store/memory_status_test.go @@ -250,7 +250,7 @@ func TestUpdateStatus(t *testing.T) { } // Create MemoryStore with populated storage - ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) + ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) // Get model and version for testing model, modelVersion, _, err := ms.getModelServer(test.modelName, test.version, test.serverName) diff --git a/scheduler/pkg/store/memory_test.go b/scheduler/pkg/store/memory_test.go index d495ca60eb..6d366bc1d4 100644 --- a/scheduler/pkg/store/memory_test.go +++ b/scheduler/pkg/store/memory_test.go @@ -231,7 +231,7 @@ func TestUpdateModel(t *testing.T) { } // Create MemoryStore with populated storage - ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) + ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) err = ms.UpdateModel(test.loadModelReq) if test.err != nil { @@ -312,7 +312,7 @@ func TestGetModel(t *testing.T) { } // Create MemoryStore with populated storage - ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) + ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) model, err := ms.GetModel(test.key) if test.err == nil { @@ -507,9 +507,9 @@ func TestGetServer(t *testing.T) { } // Create MemoryStore with populated storage - ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) + ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) - server, _, err := ms.GetServer(test.key, false, true) + server, _, err := ms.GetServer(test.key, true) if !test.isErr { g.Expect(err).To(BeNil()) g.Expect(server.Name).To(Equal(test.expected.Name)) @@ -520,17 +520,9 @@ func TestGetServer(t *testing.T) { g.Expect(proto.Equal(m, test.expected.Replicas[k].LoadedModels[i])).To(BeTrue()) } } - } else { - g.Expect(err).ToNot(BeNil()) - } - - // no details - server, _, err = ms.GetServer(test.key, false, false) - if !test.isErr { - for _, v := range server.Replicas { - g.Expect(len(v.LoadedModels)).To(Equal(0)) - } + return } + g.Expect(err).ToNot(BeNil()) }) } } @@ -569,6 +561,7 @@ func TestRemoveModel(t *testing.T) { }, }, Replicas: make(map[int32]*db.ReplicaStatus), + State: &db.ModelStatus{}, }, }, }, @@ -599,23 +592,17 @@ func TestRemoveModel(t *testing.T) { } // Create MemoryStore with populated storage - ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) - + ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) err = ms.RemoveModel(&pb.UnloadModelRequest{Model: &pb.ModelReference{Name: test.key}}) if !test.err { g.Expect(err).To(BeNil()) - } else { - g.Expect(err).ToNot(BeNil()) + return } + g.Expect(err).ToNot(BeNil()) }) } } -// TODO: This test requires refactoring for the new storage architecture. -// It tests internal implementation details (updateLoadedModelsImpl) that work directly -// with LocalSchedulerStore. The new architecture uses Storage interfaces and may need -// a different approach to testing these internal methods. -/* func TestUpdateLoadedModels(t *testing.T) { g := NewGomegaWithT(t) ctx := context.Background() @@ -636,401 +623,379 @@ func TestUpdateLoadedModels(t *testing.T) { } tests := []test{ - { - name: "ModelVersionNotLatest", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - server: "server", - version: 1, - replicas: map[int]ReplicaStatus{}, - }, - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - server: "server", - version: 2, - replicas: map[int]ReplicaStatus{}, - }, - }, - }}, - servers: map[string]*Server{ - "server": { - name: "server", - }, - }, - }, - modelKey: "model", - version: 1, - serverKey: "server", - replicas: nil, - err: true, - }, - { - name: "UpdatedVersionsOK", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - server: "server", - version: 1, - replicas: map[int]ReplicaStatus{}, - }, - }, - }}, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {}, - 1: {}, - }, - }, - }, - }, - modelKey: "model", - version: 1, - serverKey: "server", - replicas: []*ServerReplica{ - {replicaIdx: 0}, {replicaIdx: 1}, - }, - expectedStates: map[int]ReplicaStatus{0: {State: LoadRequested}, 1: {State: LoadRequested}}, - }, - { - name: "WithAlreadyLoadedModels", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - server: "server", - version: 1, - replicas: map[int]ReplicaStatus{ - 0: {State: Loaded}, - }, - }, - }, - }}, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {}, - 1: {}, - }, - }, - }, - }, - modelKey: "model", - version: 1, - serverKey: "server", - replicas: []*ServerReplica{ - {replicaIdx: 0}, {replicaIdx: 1}, - }, - expectedStates: map[int]ReplicaStatus{0: {State: Loaded}, 1: {State: LoadRequested}}, - }, - { - name: "UnloadModelsNotSelected", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - server: "server", - version: 1, - replicas: map[int]ReplicaStatus{ - 0: {State: Loaded}, - }, - }, - }, - }}, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {}, - 1: {}, - }, - }, - }, - }, - modelKey: "model", - version: 1, - serverKey: "server", - replicas: []*ServerReplica{ - {replicaIdx: 1}, - }, - expectedStates: map[int]ReplicaStatus{0: {State: UnloadEnvoyRequested}, 1: {State: LoadRequested}}, - }, - { - name: "DeletedModel", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - server: "server", - version: 1, - replicas: map[int]ReplicaStatus{ - 0: {State: Loaded}, - 1: {State: Loading}, - }, - }, - }, - }}, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {}, - 1: {}, - }, - }, - }, - }, - modelKey: "model", - version: 1, - serverKey: "server", - replicas: []*ServerReplica{}, - isModelDeleted: true, - expectedStates: map[int]ReplicaStatus{0: {State: UnloadEnvoyRequested}, 1: {State: UnloadEnvoyRequested}}, - }, - { - name: "DeletedModelNoReplicas", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - server: "server", - version: 1, - replicas: map[int]ReplicaStatus{ - 0: {State: Unloaded}, - }, - }, - }, - }}, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {}, - 1: {}, - }, - }, - }, - }, - modelKey: "model", - version: 1, - serverKey: "server", - replicas: []*ServerReplica{}, - isModelDeleted: true, - expectedStates: map[int]ReplicaStatus{0: {State: Unloaded}}, - }, - { - name: "ServerChanged", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - server: "server1", - version: 1, - replicas: map[int]ReplicaStatus{}, - }, - }, - }}, - servers: map[string]*Server{ - "server1": { - name: "server1", - replicas: map[int]*ServerReplica{ - 0: {}, - 1: {}, - }, - }, - "server2": { - name: "server2", - replicas: map[int]*ServerReplica{ - 0: {}, - 1: {}, - }, - }, - }, - }, - modelKey: "model", - version: 1, - serverKey: "server2", - replicas: []*ServerReplica{ - {replicaIdx: 0}, {replicaIdx: 1}, - }, - expectedStates: map[int]ReplicaStatus{0: {State: LoadRequested}, 1: {State: LoadRequested}}, - }, - { - name: "WithDrainingServerReplicaSameServer", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - server: "server", - version: 1, - replicas: map[int]ReplicaStatus{ - 0: {State: Draining}, - }, - }, - }, - }}, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {isDraining: true}, - 1: {}, - }, - }, - }, - }, - modelKey: "model", - version: 1, - serverKey: "server", - replicas: []*ServerReplica{ - {replicaIdx: 1}, - }, - expectedStates: map[int]ReplicaStatus{0: {State: Draining}, 1: {State: LoadRequested}}, - }, - { - name: "WithDrainingServerReplicaNewServer", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - server: "server1", - version: 1, - replicas: map[int]ReplicaStatus{ - 0: {State: Draining}, - }, - }, - }, - }}, - servers: map[string]*Server{ - "server1": { - name: "server1", - replicas: map[int]*ServerReplica{ - 0: {isDraining: true}, - }, - }, - "server2": { - name: "server2", - replicas: map[int]*ServerReplica{ - 0: {}, - }, - }, - }, - }, - modelKey: "model", - version: 1, - serverKey: "server2", - replicas: []*ServerReplica{ - {replicaIdx: 0}, - }, - expectedStates: map[int]ReplicaStatus{0: {State: LoadRequested}}, - }, - { - name: "DeleteFailedScheduleModel", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - server: "", - version: 1, - replicas: map[int]ReplicaStatus{}, - }, - }, - }}, - servers: map[string]*Server{}, - }, - modelKey: "model", - version: 1, - serverKey: "", - replicas: []*ServerReplica{}, - isModelDeleted: true, - expectedStates: map[int]ReplicaStatus{}, - }, - { - name: "ProgressModelLoading", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, - server: "server", - version: 1, - replicas: map[int]ReplicaStatus{ - 0: {State: Available}, - 1: {State: Unloaded}, - }, - state: ModelStatus{State: ModelProgressing}, - }, - }, - }}, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {}, - 1: {}, - }, - }, - }, - }, - modelKey: "model", - version: 1, - serverKey: "server", - replicas: []*ServerReplica{ - {replicaIdx: 0}, - }, - expectedStates: map[int]ReplicaStatus{0: {State: Available}, 1: {State: Unloaded}}, - expectedModelState: &ModelStatus{State: ModelAvailable}, - }, + //{ + // name: "ModelVersionNotLatest", + // models: map[string]*Model{"model": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // server: "server", + // version: 1, + // replicas: map[int]ReplicaStatus{}, + // }, + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // server: "server", + // version: 2, + // replicas: map[int]ReplicaStatus{}, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server": { + // name: "server", + // }, + // }, + // modelKey: "model", + // version: 1, + // serverKey: "server", + // replicas: nil, + // err: true, + //}, + //{ + // name: "UpdatedVersionsOK", + // models: map[string]*Model{"model": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // server: "server", + // version: 1, + // replicas: map[int]ReplicaStatus{}, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server": { + // name: "server", + // replicas: map[int]*ServerReplica{ + // 0: {}, + // 1: {}, + // }, + // }, + // }, + // modelKey: "model", + // version: 1, + // serverKey: "server", + // replicas: []*ServerReplica{ + // {replicaIdx: 0}, {replicaIdx: 1}, + // }, + // expectedStates: map[int]ReplicaStatus{0: {State: LoadRequested}, 1: {State: LoadRequested}}, + //}, + //{ + // name: "WithAlreadyLoadedModels", + // models: map[string]*Model{"model": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // server: "server", + // version: 1, + // replicas: map[int]ReplicaStatus{ + // 0: {State: Loaded}, + // }, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server": { + // name: "server", + // replicas: map[int]*ServerReplica{ + // 0: {}, + // 1: {}, + // }, + // }, + // }, + // modelKey: "model", + // version: 1, + // serverKey: "server", + // replicas: []*ServerReplica{ + // {replicaIdx: 0}, {replicaIdx: 1}, + // }, + // expectedStates: map[int]ReplicaStatus{0: {State: Loaded}, 1: {State: LoadRequested}}, + //}, + //{ + // name: "UnloadModelsNotSelected", + // models: map[string]*Model{"model": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // server: "server", + // version: 1, + // replicas: map[int]ReplicaStatus{ + // 0: {State: Loaded}, + // }, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server": { + // name: "server", + // replicas: map[int]*ServerReplica{ + // 0: {}, + // 1: {}, + // }, + // }, + // }, + // modelKey: "model", + // version: 1, + // serverKey: "server", + // replicas: []*ServerReplica{ + // {replicaIdx: 1}, + // }, + // expectedStates: map[int]ReplicaStatus{0: {State: UnloadEnvoyRequested}, 1: {State: LoadRequested}}, + //}, + //{ + // name: "DeletedModel", + // models: map[string]*Model{"model": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // server: "server", + // version: 1, + // replicas: map[int]ReplicaStatus{ + // 0: {State: Loaded}, + // 1: {State: Loading}, + // }, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server": { + // name: "server", + // replicas: map[int]*ServerReplica{ + // 0: {}, + // 1: {}, + // }, + // }, + // }, + // modelKey: "model", + // version: 1, + // serverKey: "server", + // replicas: []*ServerReplica{}, + // isModelDeleted: true, + // expectedStates: map[int]ReplicaStatus{0: {State: UnloadEnvoyRequested}, 1: {State: UnloadEnvoyRequested}}, + //}, + //{ + // name: "DeletedModelNoReplicas", + // models: map[string]*Model{"model": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // server: "server", + // version: 1, + // replicas: map[int]ReplicaStatus{ + // 0: {State: Unloaded}, + // }, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server": { + // name: "server", + // replicas: map[int]*ServerReplica{ + // 0: {}, + // 1: {}, + // }, + // }, + // }, + // modelKey: "model", + // version: 1, + // serverKey: "server", + // replicas: []*ServerReplica{}, + // isModelDeleted: true, + // expectedStates: map[int]ReplicaStatus{0: {State: Unloaded}}, + //}, + //{ + // name: "ServerChanged", + // models: map[string]*Model{"model": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // server: "server1", + // version: 1, + // replicas: map[int]ReplicaStatus{}, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server1": { + // name: "server1", + // replicas: map[int]*ServerReplica{ + // 0: {}, + // 1: {}, + // }, + // }, + // "server2": { + // name: "server2", + // replicas: map[int]*ServerReplica{ + // 0: {}, + // 1: {}, + // }, + // }, + // }, + // modelKey: "model", + // version: 1, + // serverKey: "server2", + // replicas: []*ServerReplica{ + // {replicaIdx: 0}, {replicaIdx: 1}, + // }, + // expectedStates: map[int]ReplicaStatus{0: {State: LoadRequested}, 1: {State: LoadRequested}}, + //}, + //{ + // name: "WithDrainingServerReplicaSameServer", + // models: map[string]*Model{"model": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // server: "server", + // version: 1, + // replicas: map[int]ReplicaStatus{ + // 0: {State: Draining}, + // }, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server": { + // name: "server", + // replicas: map[int]*ServerReplica{ + // 0: {isDraining: true}, + // 1: {}, + // }, + // }, + // }, + // modelKey: "model", + // version: 1, + // serverKey: "server", + // replicas: []*ServerReplica{ + // {replicaIdx: 1}, + // }, + // expectedStates: map[int]ReplicaStatus{0: {State: Draining}, 1: {State: LoadRequested}}, + //}, + //{ + // name: "WithDrainingServerReplicaNewServer", + // models: map[string]*Model{"model": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // server: "server1", + // version: 1, + // replicas: map[int]ReplicaStatus{ + // 0: {State: Draining}, + // }, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server1": { + // name: "server1", + // replicas: map[int]*ServerReplica{ + // 0: {isDraining: true}, + // }, + // }, + // "server2": { + // name: "server2", + // replicas: map[int]*ServerReplica{ + // 0: {}, + // }, + // }, + // }, + // modelKey: "model", + // version: 1, + // serverKey: "server2", + // replicas: []*ServerReplica{ + // {replicaIdx: 0}, + // }, + // expectedStates: map[int]ReplicaStatus{0: {State: LoadRequested}}, + //}, + //{ + // name: "DeleteFailedScheduleModel", + // models: map[string]*Model{"model": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // server: "", + // version: 1, + // replicas: map[int]ReplicaStatus{}, + // }, + // }, + // }}, + // servers: map[string]*Server{}, + // modelKey: "model", + // version: 1, + // serverKey: "", + // replicas: []*ServerReplica{}, + // isModelDeleted: true, + // expectedStates: map[int]ReplicaStatus{}, + //}, + //{ + // name: "ProgressModelLoading", + // models: map[string]*Model{"model": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, + // server: "server", + // version: 1, + // replicas: map[int]ReplicaStatus{ + // 0: {State: Available}, + // 1: {State: Unloaded}, + // }, + // state: ModelStatus{State: ModelProgressing}, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server": { + // name: "server", + // replicas: map[int]*ServerReplica{ + // 0: {}, + // 1: {}, + // }, + // }, + // }, + // modelKey: "model", + // version: 1, + // serverKey: "server", + // replicas: []*ServerReplica{ + // {replicaIdx: 0}, + // }, + // expectedStates: map[int]ReplicaStatus{0: {State: Available}, 1: {State: Unloaded}}, + // expectedModelState: &ModelStatus{State: ModelAvailable}, + //}, { name: "PartiallyAvailableModels", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ + models: []*db.Model{ + { + Name: "model", + Versions: []*db.ModelVersion{ { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 3, MinReplicas: 2}}, - server: "server", - version: 1, - replicas: map[int]ReplicaStatus{ - 0: {State: Available}, - 1: {State: Available}, + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 3, MinReplicas: 2}}, + Server: "server", + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Available}, + 1: {State: db.ModelReplicaState_Available}, }, - state: ModelStatus{State: ModelProgressing}, + State: &db.ModelStatus{State: db.ModelState_ModelProgressing}, }, }, }}, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {}, - 1: {}, - }, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + 1: {}, }, }, }, modelKey: "model", version: 1, serverKey: "server", - replicas: []*ServerReplica{ - {replicaIdx: 0}, {replicaIdx: 1}, + replicas: []*db.ServerReplica{ + {ReplicaIdx: 0}, {ReplicaIdx: 1}, }, - expectedStates: map[int]ReplicaStatus{0: {State: Available}, 1: {State: Available}}, - expectedModelState: &ModelStatus{State: ModelAvailable}, + expectedStates: map[int]db.ModelReplicaState{0: db.ModelReplicaState_Available, 1: db.ModelReplicaState_Available}, + expectedModelState: &db.ModelStatus{State: db.ModelState_ModelAvailable}, }, } @@ -1039,36 +1004,59 @@ func TestUpdateLoadedModels(t *testing.T) { logger := log.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - if test.isModelDeleted { - test.store.models[test.modelKey].SetDeleted() + + // Create storage instances + modelStorage := NewInMemoryStorage[*db.Model]() + serverStorage := NewInMemoryStorage[*db.Server]() + + // Populate storage with test data + for _, model := range test.models { + if test.isModelDeleted { + model.Deleted = true + } + err := modelStorage.Insert(ctx, model) + g.Expect(err).To(BeNil()) } - ms := NewMemoryStore(logger, test.store, eventHub) + for _, server := range test.servers { + err := serverStorage.Insert(ctx, server) + g.Expect(err).To(BeNil()) + } + + ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) msg, err := ms.updateLoadedModelsImpl(test.modelKey, test.version, test.serverKey, test.replicas) if !test.err { g.Expect(err).To(BeNil()) g.Expect(msg).ToNot(BeNil()) - mv := test.store.models[test.modelKey].Latest() + + model, err := ms.GetModel(test.modelKey) + g.Expect(err).To(BeNil()) + + mv := model.Latest() + g.Expect(mv).ToNot(BeNil()) + for replicaIdx, state := range test.expectedStates { g.Expect(mv).ToNot(BeNil()) - g.Expect(mv.GetModelReplicaState(replicaIdx)).To(Equal(state.State)) - ss, _ := ms.GetServer(test.serverKey, false, true) - if state.State == LoadRequested { - g.Expect(ss.Replicas[replicaIdx].GetReservedMemory()).To(Equal(memBytes)) + g.Expect(mv.GetModelReplicaState(replicaIdx)).To(Equal(state)) + ss, _, err := ms.GetServer(test.serverKey, false) + g.Expect(err).To(BeNil()) + + if state == db.ModelReplicaState_LoadRequested { + g.Expect(ss.Replicas[int32(replicaIdx)].GetReservedMemory()).To(Equal(memBytes)) } else { - g.Expect(ss.Replicas[replicaIdx].GetReservedMemory()).To(Equal(uint64(0))) + g.Expect(ss.Replicas[int32(replicaIdx)].GetReservedMemory()).To(Equal(uint64(0))) } } if test.expectedModelState != nil { - g.Expect(mv.state.State).To(Equal(test.expectedModelState.State)) + g.Expect(mv.State).To(Equal(test.expectedModelState.State)) } - } else { - g.Expect(err).ToNot(BeNil()) - g.Expect(msg).To(BeNil()) + return } + + g.Expect(err).ToNot(BeNil()) + g.Expect(msg).To(BeNil()) }) } } -*/ // TODO: This test requires refactoring for the new storage architecture. // It tests internal implementation details (UpdateModelState) that work directly @@ -1397,7 +1385,7 @@ func TestUpdateModelState(t *testing.T) { }, ) - ms := NewMemoryStore(logger, test.store, eventHub) + ms := NewModelServerStore(logger, test.store, eventHub) err = ms.UpdateModelState(test.modelKey, test.version, test.serverKey, test.replicaIdx, &test.availableMemory, test.expectedState, test.desiredState, "", test.modelRuntimeInfo) if !test.err { g.Expect(err).To(BeNil()) @@ -1712,7 +1700,7 @@ func TestUpdateModelStatus(t *testing.T) { logger := log.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - ms := NewMemoryStore(logger, &LocalSchedulerStore{}, eventHub) + ms := NewModelServerStore(logger, &LocalSchedulerStore{}, eventHub) ms.updateModelStatus(true, test.deleted, test.modelVersion, test.prevAvailableModelVersion) g.Expect(test.modelVersion.state.State).To(Equal(test.expectedState)) g.Expect(test.modelVersion.state.Reason).To(Equal(test.expectedReason)) @@ -1875,7 +1863,7 @@ func TestAddModelVersionIfNotExists(t *testing.T) { logger := log.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - ms := NewMemoryStore(logger, test.store, eventHub) + ms := NewModelServerStore(logger, test.store, eventHub) ms.addModelVersionIfNotExists(test.modelVersion) modelName := test.modelVersion.GetModel().GetMeta().GetName() g.Expect(test.store.models[modelName].GetVersions()).To(Equal(test.expected)) @@ -2016,7 +2004,7 @@ func TestAddServerReplica(t *testing.T) { } // Create MemoryStore with populated storage - ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) + ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) // register a callback to check if the event is triggered serverEvents := int64(0) @@ -2274,7 +2262,7 @@ func TestRemoveServerReplica(t *testing.T) { } // Create MemoryStore with populated storage - ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) + ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) models, err := ms.RemoveServerReplica(test.serverName, test.replicaIdx) g.Expect(err).To(BeNil()) @@ -2420,7 +2408,7 @@ func TestDrainServerReplica(t *testing.T) { } // Create MemoryStore with populated storage - ms := NewMemoryStore(logger, modelStorage, serverStorage, eventHub) + ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) models, err := ms.DrainServerReplica(test.serverName, test.replicaIdx) g.Expect(err).To(BeNil()) diff --git a/scheduler/pkg/store/test_memory_hack.go b/scheduler/pkg/store/test_memory_hack.go index 403b245721..f75dbaae3c 100644 --- a/scheduler/pkg/store/test_memory_hack.go +++ b/scheduler/pkg/store/test_memory_hack.go @@ -43,7 +43,7 @@ func NewTestMemory( if t == nil { panic("testing.T is required, must only be run via tests") } - m := NewMemoryStore(logger, modelStore, serverStore, eventHub) + m := NewModelServerStore(logger, modelStore, serverStore, eventHub) return &TestMemoryStore{m} } From d90619c1004ab51ae21b1594724d0340cb4ea73f Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Fri, 9 Jan 2026 15:06:53 +0000 Subject: [PATCH 26/50] more test fixes --- scheduler/pkg/store/memory_test.go | 859 ++++++++++++++++------------- 1 file changed, 473 insertions(+), 386 deletions(-) diff --git a/scheduler/pkg/store/memory_test.go b/scheduler/pkg/store/memory_test.go index 6d366bc1d4..88f0f631ae 100644 --- a/scheduler/pkg/store/memory_test.go +++ b/scheduler/pkg/store/memory_test.go @@ -612,7 +612,7 @@ func TestUpdateLoadedModels(t *testing.T) { name string models []*db.Model servers []*db.Server - modelKey string + modelName string version uint32 serverKey string replicas []*db.ServerReplica @@ -623,354 +623,440 @@ func TestUpdateLoadedModels(t *testing.T) { } tests := []test{ - //{ - // name: "ModelVersionNotLatest", - // models: map[string]*Model{"model": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // server: "server", - // version: 1, - // replicas: map[int]ReplicaStatus{}, - // }, - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // server: "server", - // version: 2, - // replicas: map[int]ReplicaStatus{}, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server": { - // name: "server", - // }, - // }, - // modelKey: "model", - // version: 1, - // serverKey: "server", - // replicas: nil, - // err: true, - //}, - //{ - // name: "UpdatedVersionsOK", - // models: map[string]*Model{"model": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // server: "server", - // version: 1, - // replicas: map[int]ReplicaStatus{}, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server": { - // name: "server", - // replicas: map[int]*ServerReplica{ - // 0: {}, - // 1: {}, - // }, - // }, - // }, - // modelKey: "model", - // version: 1, - // serverKey: "server", - // replicas: []*ServerReplica{ - // {replicaIdx: 0}, {replicaIdx: 1}, - // }, - // expectedStates: map[int]ReplicaStatus{0: {State: LoadRequested}, 1: {State: LoadRequested}}, - //}, - //{ - // name: "WithAlreadyLoadedModels", - // models: map[string]*Model{"model": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // server: "server", - // version: 1, - // replicas: map[int]ReplicaStatus{ - // 0: {State: Loaded}, - // }, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server": { - // name: "server", - // replicas: map[int]*ServerReplica{ - // 0: {}, - // 1: {}, - // }, - // }, - // }, - // modelKey: "model", - // version: 1, - // serverKey: "server", - // replicas: []*ServerReplica{ - // {replicaIdx: 0}, {replicaIdx: 1}, - // }, - // expectedStates: map[int]ReplicaStatus{0: {State: Loaded}, 1: {State: LoadRequested}}, - //}, - //{ - // name: "UnloadModelsNotSelected", - // models: map[string]*Model{"model": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // server: "server", - // version: 1, - // replicas: map[int]ReplicaStatus{ - // 0: {State: Loaded}, - // }, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server": { - // name: "server", - // replicas: map[int]*ServerReplica{ - // 0: {}, - // 1: {}, - // }, - // }, - // }, - // modelKey: "model", - // version: 1, - // serverKey: "server", - // replicas: []*ServerReplica{ - // {replicaIdx: 1}, - // }, - // expectedStates: map[int]ReplicaStatus{0: {State: UnloadEnvoyRequested}, 1: {State: LoadRequested}}, - //}, - //{ - // name: "DeletedModel", - // models: map[string]*Model{"model": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // server: "server", - // version: 1, - // replicas: map[int]ReplicaStatus{ - // 0: {State: Loaded}, - // 1: {State: Loading}, - // }, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server": { - // name: "server", - // replicas: map[int]*ServerReplica{ - // 0: {}, - // 1: {}, - // }, - // }, - // }, - // modelKey: "model", - // version: 1, - // serverKey: "server", - // replicas: []*ServerReplica{}, - // isModelDeleted: true, - // expectedStates: map[int]ReplicaStatus{0: {State: UnloadEnvoyRequested}, 1: {State: UnloadEnvoyRequested}}, - //}, - //{ - // name: "DeletedModelNoReplicas", - // models: map[string]*Model{"model": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // server: "server", - // version: 1, - // replicas: map[int]ReplicaStatus{ - // 0: {State: Unloaded}, - // }, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server": { - // name: "server", - // replicas: map[int]*ServerReplica{ - // 0: {}, - // 1: {}, - // }, - // }, - // }, - // modelKey: "model", - // version: 1, - // serverKey: "server", - // replicas: []*ServerReplica{}, - // isModelDeleted: true, - // expectedStates: map[int]ReplicaStatus{0: {State: Unloaded}}, - //}, - //{ - // name: "ServerChanged", - // models: map[string]*Model{"model": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // server: "server1", - // version: 1, - // replicas: map[int]ReplicaStatus{}, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server1": { - // name: "server1", - // replicas: map[int]*ServerReplica{ - // 0: {}, - // 1: {}, - // }, - // }, - // "server2": { - // name: "server2", - // replicas: map[int]*ServerReplica{ - // 0: {}, - // 1: {}, - // }, - // }, - // }, - // modelKey: "model", - // version: 1, - // serverKey: "server2", - // replicas: []*ServerReplica{ - // {replicaIdx: 0}, {replicaIdx: 1}, - // }, - // expectedStates: map[int]ReplicaStatus{0: {State: LoadRequested}, 1: {State: LoadRequested}}, - //}, - //{ - // name: "WithDrainingServerReplicaSameServer", - // models: map[string]*Model{"model": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // server: "server", - // version: 1, - // replicas: map[int]ReplicaStatus{ - // 0: {State: Draining}, - // }, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server": { - // name: "server", - // replicas: map[int]*ServerReplica{ - // 0: {isDraining: true}, - // 1: {}, - // }, - // }, - // }, - // modelKey: "model", - // version: 1, - // serverKey: "server", - // replicas: []*ServerReplica{ - // {replicaIdx: 1}, - // }, - // expectedStates: map[int]ReplicaStatus{0: {State: Draining}, 1: {State: LoadRequested}}, - //}, - //{ - // name: "WithDrainingServerReplicaNewServer", - // models: map[string]*Model{"model": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // server: "server1", - // version: 1, - // replicas: map[int]ReplicaStatus{ - // 0: {State: Draining}, - // }, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server1": { - // name: "server1", - // replicas: map[int]*ServerReplica{ - // 0: {isDraining: true}, - // }, - // }, - // "server2": { - // name: "server2", - // replicas: map[int]*ServerReplica{ - // 0: {}, - // }, - // }, - // }, - // modelKey: "model", - // version: 1, - // serverKey: "server2", - // replicas: []*ServerReplica{ - // {replicaIdx: 0}, - // }, - // expectedStates: map[int]ReplicaStatus{0: {State: LoadRequested}}, - //}, - //{ - // name: "DeleteFailedScheduleModel", - // models: map[string]*Model{"model": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // server: "", - // version: 1, - // replicas: map[int]ReplicaStatus{}, - // }, - // }, - // }}, - // servers: map[string]*Server{}, - // modelKey: "model", - // version: 1, - // serverKey: "", - // replicas: []*ServerReplica{}, - // isModelDeleted: true, - // expectedStates: map[int]ReplicaStatus{}, - //}, - //{ - // name: "ProgressModelLoading", - // models: map[string]*Model{"model": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, - // server: "server", - // version: 1, - // replicas: map[int]ReplicaStatus{ - // 0: {State: Available}, - // 1: {State: Unloaded}, - // }, - // state: ModelStatus{State: ModelProgressing}, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server": { - // name: "server", - // replicas: map[int]*ServerReplica{ - // 0: {}, - // 1: {}, - // }, - // }, - // }, - // modelKey: "model", - // version: 1, - // serverKey: "server", - // replicas: []*ServerReplica{ - // {replicaIdx: 0}, - // }, - // expectedStates: map[int]ReplicaStatus{0: {State: Available}, 1: {State: Unloaded}}, - // expectedModelState: &ModelStatus{State: ModelAvailable}, - //}, + { + name: "ModelVersionNotLatest", + models: []*db.Model{{ + Name: "model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Server: "server", + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{}, + State: &db.ModelStatus{}, + }, + { + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Server: "server", + Version: 2, + Replicas: map[int32]*db.ReplicaStatus{}, + State: &db.ModelStatus{}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server", + }, + }, + modelName: "model", + version: 1, + serverKey: "server", + replicas: nil, + err: true, + }, + { + name: "UpdatedVersionsOK", + models: []*db.Model{{ + Name: "model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Server: "server", + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{}, + State: &db.ModelStatus{}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + 1: {}, + }, + }, + }, + modelName: "model", + version: 1, + serverKey: "server", + replicas: []*db.ServerReplica{ + {ReplicaIdx: 0}, {ReplicaIdx: 1}, + }, + expectedStates: map[int]db.ModelReplicaState{0: db.ModelReplicaState_LoadRequested, 1: db.ModelReplicaState_LoadRequested}, + }, + { + name: "WithAlreadyLoadedModels", + models: []*db.Model{{ + Name: "model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Server: "server", + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Loaded}, + }, + State: &db.ModelStatus{}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + 1: {}, + }, + }, + }, + modelName: "model", + version: 1, + serverKey: "server", + replicas: []*db.ServerReplica{ + {ReplicaIdx: 0}, {ReplicaIdx: 1}, + }, + expectedStates: map[int]db.ModelReplicaState{0: db.ModelReplicaState_Loaded, 1: db.ModelReplicaState_LoadRequested}, + }, + { + name: "UnloadModelsNotSelected", + models: []*db.Model{{ + Name: "model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Server: "server", + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Loaded}, + }, + State: &db.ModelStatus{}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + 1: {}, + }, + }, + }, + modelName: "model", + version: 1, + serverKey: "server", + replicas: []*db.ServerReplica{ + {ReplicaIdx: 1}, + }, + expectedStates: map[int]db.ModelReplicaState{0: db.ModelReplicaState_UnloadEnvoyRequested, 1: db.ModelReplicaState_LoadRequested}, + }, + { + name: "DeletedModel", + models: []*db.Model{{ + Name: "model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Server: "server", + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Loaded}, + 1: {State: db.ModelReplicaState_Loading}, + }, + State: &db.ModelStatus{}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + 1: {}, + }, + }, + }, + modelName: "model", + version: 1, + serverKey: "server", + replicas: []*db.ServerReplica{}, + isModelDeleted: true, + expectedStates: map[int]db.ModelReplicaState{0: db.ModelReplicaState_UnloadEnvoyRequested, 1: db.ModelReplicaState_UnloadEnvoyRequested}, + }, + { + name: "DeletedModelNoReplicas", + models: []*db.Model{{ + Name: "model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Server: "server", + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Unloaded}, + }, + State: &db.ModelStatus{}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + 1: {}, + }, + }, + }, + modelName: "model", + version: 1, + serverKey: "server", + replicas: []*db.ServerReplica{}, + isModelDeleted: true, + expectedStates: map[int]db.ModelReplicaState{0: db.ModelReplicaState_Unloaded}, + }, + { + name: "ServerChanged", + models: []*db.Model{{ + Name: "model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Server: "server1", + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{}, + State: &db.ModelStatus{}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + 1: {}, + }, + }, + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + 1: {}, + }, + }, + }, + modelName: "model", + version: 1, + serverKey: "server2", + replicas: []*db.ServerReplica{ + {ReplicaIdx: 0}, {ReplicaIdx: 1}, + }, + expectedStates: map[int]db.ModelReplicaState{0: db.ModelReplicaState_LoadRequested, 1: db.ModelReplicaState_LoadRequested}, + }, + { + name: "WithDrainingServerReplicaSameServer", + models: []*db.Model{{ + Name: "model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Server: "server", + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Draining}, + }, + State: &db.ModelStatus{}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {IsDraining: true}, + 1: {}, + }, + }, + }, + modelName: "model", + version: 1, + serverKey: "server", + replicas: []*db.ServerReplica{ + {ReplicaIdx: 1}, + }, + expectedStates: map[int]db.ModelReplicaState{0: db.ModelReplicaState_Draining, 1: db.ModelReplicaState_LoadRequested}, + }, + { + name: "WithDrainingServerReplicaNewServer", + models: []*db.Model{{ + Name: "model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}, + }, + Server: "server1", + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Draining}, + }, + State: &db.ModelStatus{}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: {IsDraining: true}, + }, + }, + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + }, + }, + }, + modelName: "model", + version: 1, + serverKey: "server2", + replicas: []*db.ServerReplica{ + {ReplicaIdx: 0}, + }, + expectedStates: map[int]db.ModelReplicaState{0: db.ModelReplicaState_LoadRequested}, + }, + { + name: "DeleteFailedScheduleModel", + models: []*db.Model{{ + Name: "model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", + }, + ModelSpec: &pb.ModelSpec{ + MemoryBytes: &memBytes, + }}, + Server: "", + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{}, + }, + }, + }}, + servers: []*db.Server{}, + modelName: "model", + version: 1, + serverKey: "", + replicas: []*db.ServerReplica{}, + isModelDeleted: true, + expectedStates: map[int]db.ModelReplicaState{}, + }, + { + name: "ProgressModelLoading", + models: []*db.Model{ + { + Name: "my-model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "my-model", + }, + ModelSpec: &pb.ModelSpec{ + MemoryBytes: &memBytes, + }, + DeploymentSpec: &pb.DeploymentSpec{ + Replicas: 1, + }, + }, + Server: "server", + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Available}, + 1: {State: db.ModelReplicaState_Unloaded}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelProgressing}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + 1: {}, + }, + }, + }, + modelName: "my-model", + version: 1, + serverKey: "server", + replicas: []*db.ServerReplica{ + {ReplicaIdx: 0}, {ReplicaIdx: 0}, + }, + expectedStates: map[int]db.ModelReplicaState{0: db.ModelReplicaState_Available, 1: db.ModelReplicaState_Unloaded}, + expectedModelState: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, { name: "PartiallyAvailableModels", models: []*db.Model{ { - Name: "model", + Name: "my-model", Versions: []*db.ModelVersion{ { - ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 3, MinReplicas: 2}}, - Server: "server", - Version: 1, + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "my-model", + }, + ModelSpec: &pb.ModelSpec{ + MemoryBytes: &memBytes, + }, + DeploymentSpec: &pb.DeploymentSpec{ + Replicas: 3, + MinReplicas: 2, + }, + }, + Server: "server", + Version: 1, Replicas: map[int32]*db.ReplicaStatus{ 0: {State: db.ModelReplicaState_Available}, 1: {State: db.ModelReplicaState_Available}, @@ -988,7 +1074,7 @@ func TestUpdateLoadedModels(t *testing.T) { }, }, }, - modelKey: "model", + modelName: "my-model", version: 1, serverKey: "server", replicas: []*db.ServerReplica{ @@ -1023,37 +1109,38 @@ func TestUpdateLoadedModels(t *testing.T) { } ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) - msg, err := ms.updateLoadedModelsImpl(test.modelKey, test.version, test.serverKey, test.replicas) - if !test.err { - g.Expect(err).To(BeNil()) - g.Expect(msg).ToNot(BeNil()) + msg, err := ms.updateLoadedModelsImpl(test.modelName, test.version, test.serverKey, test.replicas) + if test.err { + g.Expect(err).ToNot(BeNil()) + g.Expect(msg).To(BeNil()) + return + } - model, err := ms.GetModel(test.modelKey) - g.Expect(err).To(BeNil()) + g.Expect(err).To(BeNil()) + g.Expect(msg).ToNot(BeNil()) + g.Expect(msg.ModelName).To(Equal(test.modelName)) - mv := model.Latest() - g.Expect(mv).ToNot(BeNil()) + model, err := ms.GetModel(test.modelName) + g.Expect(err).To(BeNil()) - for replicaIdx, state := range test.expectedStates { - g.Expect(mv).ToNot(BeNil()) - g.Expect(mv.GetModelReplicaState(replicaIdx)).To(Equal(state)) - ss, _, err := ms.GetServer(test.serverKey, false) - g.Expect(err).To(BeNil()) + mv := model.Latest() + g.Expect(mv).ToNot(BeNil()) - if state == db.ModelReplicaState_LoadRequested { - g.Expect(ss.Replicas[int32(replicaIdx)].GetReservedMemory()).To(Equal(memBytes)) - } else { - g.Expect(ss.Replicas[int32(replicaIdx)].GetReservedMemory()).To(Equal(uint64(0))) - } - } - if test.expectedModelState != nil { - g.Expect(mv.State).To(Equal(test.expectedModelState.State)) + for replicaIdx, state := range test.expectedStates { + g.Expect(mv).ToNot(BeNil()) + g.Expect(mv.GetModelReplicaState(replicaIdx)).To(Equal(state)) + ss, _, err := ms.GetServer(test.serverKey, false) + g.Expect(err).To(BeNil()) + + if state == db.ModelReplicaState_LoadRequested { + g.Expect(ss.Replicas[int32(replicaIdx)].GetReservedMemory()).To(Equal(memBytes)) + continue } - return + g.Expect(ss.Replicas[int32(replicaIdx)].GetReservedMemory()).To(Equal(uint64(0))) + } + if test.expectedModelState != nil { + g.Expect(mv.State.State).To(Equal(test.expectedModelState.State)) } - - g.Expect(err).ToNot(BeNil()) - g.Expect(msg).To(BeNil()) }) } } @@ -1070,7 +1157,7 @@ func TestUpdateModelState(t *testing.T) { type test struct { name string store *LocalSchedulerStore - modelKey string + modelName string version uint32 serverKey string replicaIdx int @@ -1107,7 +1194,7 @@ func TestUpdateModelState(t *testing.T) { }, }, }, - modelKey: "model", + modelName: "model", version: 1, serverKey: "server", replicaIdx: 0, @@ -1140,7 +1227,7 @@ func TestUpdateModelState(t *testing.T) { }, }, }, - modelKey: "model", + modelName: "model", version: 1, serverKey: "server", replicaIdx: 0, @@ -1175,7 +1262,7 @@ func TestUpdateModelState(t *testing.T) { }, }, }, - modelKey: "model", + modelName: "model", version: 1, serverKey: "server", replicaIdx: 0, @@ -1208,7 +1295,7 @@ func TestUpdateModelState(t *testing.T) { }, }, }, - modelKey: "model", + modelName: "model", version: 1, serverKey: "server", replicaIdx: 0, @@ -1251,7 +1338,7 @@ func TestUpdateModelState(t *testing.T) { }, }, }, - modelKey: "foo", + modelName: "foo", version: 1, serverKey: "server", replicaIdx: 0, @@ -1294,7 +1381,7 @@ func TestUpdateModelState(t *testing.T) { }, }, }, - modelKey: "foo", + modelName: "foo", version: 1, serverKey: "server", replicaIdx: 0, @@ -1328,7 +1415,7 @@ func TestUpdateModelState(t *testing.T) { }, }, }, - modelKey: "model", + modelName: "model", version: 1, serverKey: "server", replicaIdx: 0, @@ -1346,11 +1433,11 @@ func TestUpdateModelState(t *testing.T) { eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) if test.deleted { - test.store.models[test.modelKey].SetDeleted() + test.store.models[test.modelName].SetDeleted() } var expectedModelRuntimeInfo *pb.ModelRuntimeInfo - if test.store.models[test.modelKey].GetVersion(test.version).modelDefn.ModelSpec.ModelRuntimeInfo != nil { - expectedModelRuntimeInfo = test.store.models[test.modelKey].GetVersion(test.version).modelDefn.ModelSpec.ModelRuntimeInfo + if test.store.models[test.modelName].GetVersion(test.version).modelDefn.ModelSpec.ModelRuntimeInfo != nil { + expectedModelRuntimeInfo = test.store.models[test.modelName].GetVersion(test.version).modelDefn.ModelSpec.ModelRuntimeInfo } else { expectedModelRuntimeInfo = test.modelRuntimeInfo } @@ -1386,19 +1473,19 @@ func TestUpdateModelState(t *testing.T) { ) ms := NewModelServerStore(logger, test.store, eventHub) - err = ms.UpdateModelState(test.modelKey, test.version, test.serverKey, test.replicaIdx, &test.availableMemory, test.expectedState, test.desiredState, "", test.modelRuntimeInfo) + err = ms.UpdateModelState(test.modelName, test.version, test.serverKey, test.replicaIdx, &test.availableMemory, test.expectedState, test.desiredState, "", test.modelRuntimeInfo) if !test.err { g.Expect(err).To(BeNil()) if !test.deleted { - g.Expect(test.store.models[test.modelKey].GetVersion(test.version).GetModelReplicaState(test.replicaIdx)).To(Equal(test.desiredState)) - g.Expect(test.store.servers[test.serverKey].replicas[test.replicaIdx].loadedModels[ModelVersionID{Name: test.modelKey, Version: test.version}]).To(Equal(test.modelVersionLoaded)) + g.Expect(test.store.models[test.modelName].GetVersion(test.version).GetModelReplicaState(test.replicaIdx)).To(Equal(test.desiredState)) + g.Expect(test.store.servers[test.serverKey].replicas[test.replicaIdx].loadedModels[ModelVersionID{Name: test.modelName, Version: test.version}]).To(Equal(test.modelVersionLoaded)) g.Expect(test.store.servers[test.serverKey].replicas[test.replicaIdx].GetNumLoadedModels()).To(Equal(test.numModelVersionsLoaded)) } else { - g.Expect(test.store.models[test.modelKey].Latest().state.State).To(Equal(ModelTerminated)) + g.Expect(test.store.models[test.modelName].Latest().state.State).To(Equal(ModelTerminated)) } if expectedModelRuntimeInfo != nil { - g.Expect(test.store.models[test.modelKey].GetVersion(test.version).modelDefn.ModelSpec.ModelRuntimeInfo).To(Equal(expectedModelRuntimeInfo)) + g.Expect(test.store.models[test.modelName].GetVersion(test.version).modelDefn.ModelSpec.ModelRuntimeInfo).To(Equal(expectedModelRuntimeInfo)) } } else { g.Expect(err).ToNot(BeNil()) @@ -1406,7 +1493,7 @@ func TestUpdateModelState(t *testing.T) { if test.desiredState == Loaded || test.desiredState == LoadFailed { g.Expect(test.store.servers[test.serverKey].replicas[test.replicaIdx].GetReservedMemory()).To(Equal(uint64(0))) } else { - g.Expect(test.store.servers[test.serverKey].replicas[test.replicaIdx].GetReservedMemory()).To(Equal(test.store.models[test.modelKey].GetVersion(test.version).GetRequiredMemory())) + g.Expect(test.store.servers[test.serverKey].replicas[test.replicaIdx].GetReservedMemory()).To(Equal(test.store.models[test.modelName].GetVersion(test.version).GetRequiredMemory())) } uniqueLoadedModels := toUniqueModels(test.store.servers[test.serverKey].replicas[test.replicaIdx].loadedModels) From 41d078f28c52559fd280c09d3726036b5db9e170 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Fri, 9 Jan 2026 15:45:06 +0000 Subject: [PATCH 27/50] wip fix test --- scheduler/pkg/store/memory_test.go | 564 +++++++++++++++-------------- 1 file changed, 300 insertions(+), 264 deletions(-) diff --git a/scheduler/pkg/store/memory_test.go b/scheduler/pkg/store/memory_test.go index 88f0f631ae..db00d2d088 100644 --- a/scheduler/pkg/store/memory_test.go +++ b/scheduler/pkg/store/memory_test.go @@ -12,7 +12,9 @@ package store import ( "context" "errors" + "sync" "testing" + "time" . "github.com/onsi/gomega" log "github.com/sirupsen/logrus" @@ -1145,24 +1147,20 @@ func TestUpdateLoadedModels(t *testing.T) { } } -// TODO: This test requires refactoring for the new storage architecture. -// It tests internal implementation details (UpdateModelState) that work directly -// with LocalSchedulerStore. The new architecture uses Storage interfaces and may need -// a different approach to testing these internal methods. -/* func TestUpdateModelState(t *testing.T) { g := NewGomegaWithT(t) memBytes := uint64(1) type test struct { name string - store *LocalSchedulerStore - modelName string + modelName string + models []*db.Model + servers []*db.Server version uint32 serverKey string replicaIdx int - expectedState ModelReplicaState - desiredState ModelReplicaState + expectedState db.ModelReplicaState + desiredState db.ModelReplicaState availableMemory uint64 modelRuntimeInfo *pb.ModelRuntimeInfo numModelVersionsLoaded int @@ -1172,255 +1170,263 @@ func TestUpdateModelState(t *testing.T) { } tests := []test{ + //{ + // name: "LoadedModel", + // store: &LocalSchedulerStore{ + // models: map[string]*Model{"model": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // version: 1, + // replicas: map[int]ReplicaStatus{}, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server": { + // name: "server", + // replicas: map[int]*ServerReplica{ + // 0: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, + // 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, + // }, + // }, + // }, + // }, + // modelName: "model", + // version: 1, + // serverKey: "server", + // replicaIdx: 0, + // expectedState: ModelReplicaStateUnknown, + // desiredState: Loaded, + // numModelVersionsLoaded: 1, + // modelVersionLoaded: true, + // availableMemory: 20, + // modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, + //}, + //{ + // name: "UnloadedModel", + // store: &LocalSchedulerStore{ + // models: map[string]*Model{"model": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // version: 1, + // replicas: map[int]ReplicaStatus{}, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server": { + // name: "server", + // replicas: map[int]*ServerReplica{ + // 0: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, + // 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, + // }, + // }, + // }, + // }, + // modelName: "model", + // version: 1, + // serverKey: "server", + // replicaIdx: 0, + // expectedState: ModelReplicaStateUnknown, + // desiredState: Unloaded, + // numModelVersionsLoaded: 0, + // modelVersionLoaded: false, + // availableMemory: 20, + // modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, + //}, + //{ + // name: "Unloaded model but not matching expected state", + // store: &LocalSchedulerStore{ + // models: map[string]*Model{"model": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // version: 1, + // replicas: map[int]ReplicaStatus{ + // 0: {State: LoadRequested}, + // }, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server": { + // name: "server", + // replicas: map[int]*ServerReplica{ + // 0: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, + // 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, + // }, + // }, + // }, + // }, + // modelName: "model", + // version: 1, + // serverKey: "server", + // replicaIdx: 0, + // expectedState: Unloading, + // desiredState: Unloaded, + // numModelVersionsLoaded: 0, + // modelVersionLoaded: false, + // availableMemory: 20, + // err: true, + //}, + //{ + // name: "DeletedModel", + // store: &LocalSchedulerStore{ + // models: map[string]*Model{"model": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // version: 1, + // replicas: map[int]ReplicaStatus{}, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server": { + // name: "server", + // replicas: map[int]*ServerReplica{ + // 0: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, + // 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, + // }, + // }, + // }, + // }, + // modelName: "model", + // version: 1, + // serverKey: "server", + // replicaIdx: 0, + // expectedState: ModelReplicaStateUnknown, + // desiredState: Unloaded, + // numModelVersionsLoaded: 0, + // modelVersionLoaded: false, + // availableMemory: 20, + // deleted: true, + // modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, + //}, + //{ + // name: "Model updated but not latest on replica which is loaded", + // store: &LocalSchedulerStore{ + // models: map[string]*Model{"foo": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // version: 1, + // replicas: map[int]ReplicaStatus{ + // 0: {State: Unloading}, + // }, + // }, + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // version: 2, + // replicas: map[int]ReplicaStatus{ + // 0: {State: Loaded}, + // }, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server": { + // name: "server", + // replicas: map[int]*ServerReplica{ + // 0: {loadedModels: map[ModelVersionID]bool{{Name: "foo", Version: 2}: true, {Name: "foo", Version: 1}: true}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{"foo": true}}, + // 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, + // }, + // }, + // }, + // }, + // modelName: "foo", + // version: 1, + // serverKey: "server", + // replicaIdx: 0, + // expectedState: Unloading, + // desiredState: Unloaded, + // numModelVersionsLoaded: 1, + // modelVersionLoaded: false, + // availableMemory: 20, + // modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, + // err: false, + //}, + //{ + // name: "Model updated but not latest on replica which is Available", + // store: &LocalSchedulerStore{ + // models: map[string]*Model{"foo": { + // versions: []*ModelVersion{ + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // version: 1, + // replicas: map[int]ReplicaStatus{ + // 0: {State: Unloading}, + // }, + // }, + // { + // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + // version: 2, + // replicas: map[int]ReplicaStatus{ + // 0: {State: Available}, + // }, + // }, + // }, + // }}, + // servers: map[string]*Server{ + // "server": { + // name: "server", + // replicas: map[int]*ServerReplica{ + // 0: {loadedModels: map[ModelVersionID]bool{{Name: "foo", Version: 2}: true, {Name: "foo", Version: 1}: true}, reservedMemory: memBytes * 2, uniqueLoadedModels: map[string]bool{"foo": true}}, + // 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, + // }, + // }, + // }, + // }, + // modelName: "foo", + // version: 1, + // serverKey: "server", + // replicaIdx: 0, + // expectedState: Unloading, + // desiredState: Unloaded, + // numModelVersionsLoaded: 1, + // modelVersionLoaded: false, + // availableMemory: 20, + // modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(2)}}}, + // err: false, + //}, { - name: "LoadedModel", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - version: 1, - replicas: map[int]ReplicaStatus{}, - }, - }, - }}, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - }, - }, - }, - }, - modelName: "model", - version: 1, - serverKey: "server", - replicaIdx: 0, - expectedState: ModelReplicaStateUnknown, - desiredState: Loaded, - numModelVersionsLoaded: 1, - modelVersionLoaded: true, - availableMemory: 20, - modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, - }, - { - name: "UnloadedModel", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - version: 1, - replicas: map[int]ReplicaStatus{}, - }, - }, - }}, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - }, - }, - }, - }, - modelName: "model", - version: 1, - serverKey: "server", - replicaIdx: 0, - expectedState: ModelReplicaStateUnknown, - desiredState: Unloaded, - numModelVersionsLoaded: 0, - modelVersionLoaded: false, - availableMemory: 20, - modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, - }, - { - name: "Unloaded model but not matching expected state", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - version: 1, - replicas: map[int]ReplicaStatus{ - 0: {State: LoadRequested}, - }, - }, - }, - }}, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - }, - }, - }, - }, - modelName: "model", - version: 1, - serverKey: "server", - replicaIdx: 0, - expectedState: Unloading, - desiredState: Unloaded, - numModelVersionsLoaded: 0, - modelVersionLoaded: false, - availableMemory: 20, - err: true, - }, - { - name: "DeletedModel", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - version: 1, - replicas: map[int]ReplicaStatus{}, - }, - }, - }}, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - }, - }, - }, - }, - modelName: "model", - version: 1, - serverKey: "server", - replicaIdx: 0, - expectedState: ModelReplicaStateUnknown, - desiredState: Unloaded, - numModelVersionsLoaded: 0, - modelVersionLoaded: false, - availableMemory: 20, - deleted: true, - modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, - }, - { - name: "Model updated but not latest on replica which is loaded", - store: &LocalSchedulerStore{ - models: map[string]*Model{"foo": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - version: 1, - replicas: map[int]ReplicaStatus{ - 0: {State: Unloading}, - }, - }, - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - version: 2, - replicas: map[int]ReplicaStatus{ - 0: {State: Loaded}, - }, - }, - }, - }}, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[ModelVersionID]bool{{Name: "foo", Version: 2}: true, {Name: "foo", Version: 1}: true}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{"foo": true}}, - 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - }, - }, - }, - }, - modelName: "foo", - version: 1, - serverKey: "server", - replicaIdx: 0, - expectedState: Unloading, - desiredState: Unloaded, - numModelVersionsLoaded: 1, - modelVersionLoaded: false, - availableMemory: 20, - modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, - err: false, - }, - { - name: "Model updated but not latest on replica which is Available", - store: &LocalSchedulerStore{ - models: map[string]*Model{"foo": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - version: 1, - replicas: map[int]ReplicaStatus{ - 0: {State: Unloading}, - }, - }, - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - version: 2, - replicas: map[int]ReplicaStatus{ - 0: {State: Available}, + name: "Existing ModelRuntimeInfo is not overwritten", + models: []*db.Model{{ + Name: "my-model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "my-model", }, - }, - }, - }}, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[ModelVersionID]bool{{Name: "foo", Version: 2}: true, {Name: "foo", Version: 1}: true}, reservedMemory: memBytes * 2, uniqueLoadedModels: map[string]bool{"foo": true}}, - 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes, + ModelRuntimeInfo: &pb.ModelRuntimeInfo{ + ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{ + Mlserver: &pb.MLServerModelSettings{ + ParallelWorkers: uint32(2)}}}}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{}, + State: &db.ModelStatus{}, }, }, - }, - modelName: "foo", - version: 1, - serverKey: "server", - replicaIdx: 0, - expectedState: Unloading, - desiredState: Unloaded, - numModelVersionsLoaded: 1, - modelVersionLoaded: false, - availableMemory: 20, - modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(2)}}}, - err: false, - }, - { - name: "Existing ModelRuntimeInfo is not overwritten", - store: &LocalSchedulerStore{ - models: map[string]*Model{"model": { - versions: []*ModelVersion{ - { - modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes, ModelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(2)}}}}}, - version: 1, - replicas: map[int]ReplicaStatus{}, - }, - }, - }}, - servers: map[string]*Server{ - "server": { - name: "server", - replicas: map[int]*ServerReplica{ - 0: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - }, + }}, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {LoadedModels: []*db.ModelVersionID{}, ReservedMemory: memBytes, UniqueLoadedModels: map[string]bool{}}, + 1: {LoadedModels: []*db.ModelVersionID{}, ReservedMemory: memBytes, UniqueLoadedModels: map[string]bool{}}, }, }, }, - modelName: "model", + modelName: "my.model", version: 1, serverKey: "server", replicaIdx: 0, - expectedState: ModelReplicaStateUnknown, - desiredState: Loaded, + expectedState: db.ModelReplicaState_ModelReplicaStateUnknown, + desiredState: db.ModelReplicaState_Loaded, numModelVersionsLoaded: 1, modelVersionLoaded: true, availableMemory: 20, @@ -1430,14 +1436,33 @@ func TestUpdateModelState(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { logger := log.New() + ctx := context.Background() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - if test.deleted { - test.store.models[test.modelName].SetDeleted() + + // Create storage instances + modelStorage := NewInMemoryStorage[*db.Model]() + serverStorage := NewInMemoryStorage[*db.Server]() + + // Populate storage with test data + for _, model := range test.models { + if test.deleted { + model.Deleted = true + } + err := modelStorage.Insert(ctx, model) + g.Expect(err).To(BeNil()) } + for _, server := range test.servers { + err := serverStorage.Insert(ctx, server) + g.Expect(err).To(BeNil()) + } + var expectedModelRuntimeInfo *pb.ModelRuntimeInfo - if test.store.models[test.modelName].GetVersion(test.version).modelDefn.ModelSpec.ModelRuntimeInfo != nil { - expectedModelRuntimeInfo = test.store.models[test.modelName].GetVersion(test.version).modelDefn.ModelSpec.ModelRuntimeInfo + model, err := modelStorage.Get(ctx, test.modelName) + g.Expect(err).To(BeNil()) + + if model.GetVersion(test.version).ModelDefn.ModelSpec.ModelRuntimeInfo != nil { + expectedModelRuntimeInfo = model.GetVersion(test.version).ModelDefn.ModelSpec.ModelRuntimeInfo } else { expectedModelRuntimeInfo = test.modelRuntimeInfo } @@ -1472,32 +1497,44 @@ func TestUpdateModelState(t *testing.T) { }, ) - ms := NewModelServerStore(logger, test.store, eventHub) + getModel := func(name string) *db.Model { + model, err := modelStorage.Get(ctx, name) + g.Expect(err).To(BeNil()) + return model + } + + getServer := func(name string) *db.Server { + server, err := serverStorage.Get(ctx, name) + g.Expect(err).To(BeNil()) + return server + } + + ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) err = ms.UpdateModelState(test.modelName, test.version, test.serverKey, test.replicaIdx, &test.availableMemory, test.expectedState, test.desiredState, "", test.modelRuntimeInfo) if !test.err { g.Expect(err).To(BeNil()) if !test.deleted { - g.Expect(test.store.models[test.modelName].GetVersion(test.version).GetModelReplicaState(test.replicaIdx)).To(Equal(test.desiredState)) - g.Expect(test.store.servers[test.serverKey].replicas[test.replicaIdx].loadedModels[ModelVersionID{Name: test.modelName, Version: test.version}]).To(Equal(test.modelVersionLoaded)) - g.Expect(test.store.servers[test.serverKey].replicas[test.replicaIdx].GetNumLoadedModels()).To(Equal(test.numModelVersionsLoaded)) + g.Expect(getModel(test.modelName).GetVersion(test.version).GetModelReplicaState(test.replicaIdx)).To(Equal(test.desiredState)) + g.Expect(getServer(test.serverKey).Replicas[int32(test.replicaIdx)].LoadedModels[ModelVersionID{Name: test.modelName, Version: test.version}]).To(Equal(test.modelVersionLoaded)) + g.Expect(getServer(test.serverKey).Replicas[int32(test.replicaIdx)].GetNumLoadedModels()).To(Equal(test.numModelVersionsLoaded)) } else { - g.Expect(test.store.models[test.modelName].Latest().state.State).To(Equal(ModelTerminated)) + g.Expect(getModel(test.modelName).Latest().State.State).To(Equal(db.ModelState_ModelTerminated)) } if expectedModelRuntimeInfo != nil { - g.Expect(test.store.models[test.modelName].GetVersion(test.version).modelDefn.ModelSpec.ModelRuntimeInfo).To(Equal(expectedModelRuntimeInfo)) + g.Expect(getModel(test.modelName).GetVersion(test.version).ModelDefn.ModelSpec.ModelRuntimeInfo).To(Equal(expectedModelRuntimeInfo)) } } else { g.Expect(err).ToNot(BeNil()) } - if test.desiredState == Loaded || test.desiredState == LoadFailed { - g.Expect(test.store.servers[test.serverKey].replicas[test.replicaIdx].GetReservedMemory()).To(Equal(uint64(0))) + if test.desiredState == db.ModelReplicaState_Loaded || test.desiredState == db.ModelReplicaState_LoadFailed { + g.Expect(getServer(test.serverKey).Replicas[int32(test.replicaIdx)].GetReservedMemory()).To(Equal(uint64(0))) } else { - g.Expect(test.store.servers[test.serverKey].replicas[test.replicaIdx].GetReservedMemory()).To(Equal(test.store.models[test.modelName].GetVersion(test.version).GetRequiredMemory())) + g.Expect(getServer(test.serverKey).Replicas[int32(test.replicaIdx)].GetReservedMemory()).To(Equal(getModel(test.modelName).GetVersion(test.version).GetRequiredMemory())) } - uniqueLoadedModels := toUniqueModels(test.store.servers[test.serverKey].replicas[test.replicaIdx].loadedModels) - g.Expect(uniqueLoadedModels).To(Equal(test.store.servers[test.serverKey].replicas[test.replicaIdx].uniqueLoadedModels)) + uniqueLoadedModels := toUniqueModels(getServer(test.serverKey).Replicas[int32(test.replicaIdx)].LoadedModels) + g.Expect(uniqueLoadedModels).To(Equal(getServer(test.serverKey).Replicas[int32(test.replicaIdx)].UniqueLoadedModels)) // allow events to propagate time.Sleep(500 * time.Millisecond) @@ -1518,7 +1555,6 @@ func TestUpdateModelState(t *testing.T) { }) } } -*/ // TODO: This test requires refactoring for the new storage architecture. // It tests internal implementation details (updateModelStatus) that work directly From 2ec851bd585d8e79d6b07418211c10abb1e9b99d Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Fri, 9 Jan 2026 17:11:55 +0000 Subject: [PATCH 28/50] wip: fixing test --- apis/go/mlops/scheduler/db/server_ext.go | 4 +- scheduler/pkg/store/memory_test.go | 512 +++++++++++++---------- 2 files changed, 288 insertions(+), 228 deletions(-) diff --git a/apis/go/mlops/scheduler/db/server_ext.go b/apis/go/mlops/scheduler/db/server_ext.go index a2ebf6d2be..d0d40486bd 100644 --- a/apis/go/mlops/scheduler/db/server_ext.go +++ b/apis/go/mlops/scheduler/db/server_ext.go @@ -15,7 +15,7 @@ import ( func (s *Server) initReplicas() { if s.Replicas == nil { - s.Replicas = map[int32]*ServerReplica{} + s.Replicas = make(map[int32]*ServerReplica) } } @@ -49,7 +49,7 @@ func (s *ServerReplica) GetNumLoadedModels() int { func (s *ServerReplica) initUniqueLoadedModels() { if s.UniqueLoadedModels == nil { - s.UniqueLoadedModels = map[string]bool{} + s.UniqueLoadedModels = make(map[string]bool) } } diff --git a/scheduler/pkg/store/memory_test.go b/scheduler/pkg/store/memory_test.go index db00d2d088..284ebb4f61 100644 --- a/scheduler/pkg/store/memory_test.go +++ b/scheduler/pkg/store/memory_test.go @@ -1170,227 +1170,261 @@ func TestUpdateModelState(t *testing.T) { } tests := []test{ - //{ - // name: "LoadedModel", - // store: &LocalSchedulerStore{ - // models: map[string]*Model{"model": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // version: 1, - // replicas: map[int]ReplicaStatus{}, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server": { - // name: "server", - // replicas: map[int]*ServerReplica{ - // 0: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - // 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - // }, - // }, - // }, - // }, - // modelName: "model", - // version: 1, - // serverKey: "server", - // replicaIdx: 0, - // expectedState: ModelReplicaStateUnknown, - // desiredState: Loaded, - // numModelVersionsLoaded: 1, - // modelVersionLoaded: true, - // availableMemory: 20, - // modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, - //}, - //{ - // name: "UnloadedModel", - // store: &LocalSchedulerStore{ - // models: map[string]*Model{"model": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // version: 1, - // replicas: map[int]ReplicaStatus{}, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server": { - // name: "server", - // replicas: map[int]*ServerReplica{ - // 0: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - // 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - // }, - // }, - // }, - // }, - // modelName: "model", - // version: 1, - // serverKey: "server", - // replicaIdx: 0, - // expectedState: ModelReplicaStateUnknown, - // desiredState: Unloaded, - // numModelVersionsLoaded: 0, - // modelVersionLoaded: false, - // availableMemory: 20, - // modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, - //}, - //{ - // name: "Unloaded model but not matching expected state", - // store: &LocalSchedulerStore{ - // models: map[string]*Model{"model": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // version: 1, - // replicas: map[int]ReplicaStatus{ - // 0: {State: LoadRequested}, - // }, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server": { - // name: "server", - // replicas: map[int]*ServerReplica{ - // 0: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - // 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - // }, - // }, - // }, - // }, - // modelName: "model", - // version: 1, - // serverKey: "server", - // replicaIdx: 0, - // expectedState: Unloading, - // desiredState: Unloaded, - // numModelVersionsLoaded: 0, - // modelVersionLoaded: false, - // availableMemory: 20, - // err: true, - //}, - //{ - // name: "DeletedModel", - // store: &LocalSchedulerStore{ - // models: map[string]*Model{"model": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // version: 1, - // replicas: map[int]ReplicaStatus{}, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server": { - // name: "server", - // replicas: map[int]*ServerReplica{ - // 0: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - // 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - // }, - // }, - // }, - // }, - // modelName: "model", - // version: 1, - // serverKey: "server", - // replicaIdx: 0, - // expectedState: ModelReplicaStateUnknown, - // desiredState: Unloaded, - // numModelVersionsLoaded: 0, - // modelVersionLoaded: false, - // availableMemory: 20, - // deleted: true, - // modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, - //}, - //{ - // name: "Model updated but not latest on replica which is loaded", - // store: &LocalSchedulerStore{ - // models: map[string]*Model{"foo": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // version: 1, - // replicas: map[int]ReplicaStatus{ - // 0: {State: Unloading}, - // }, - // }, - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // version: 2, - // replicas: map[int]ReplicaStatus{ - // 0: {State: Loaded}, - // }, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server": { - // name: "server", - // replicas: map[int]*ServerReplica{ - // 0: {loadedModels: map[ModelVersionID]bool{{Name: "foo", Version: 2}: true, {Name: "foo", Version: 1}: true}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{"foo": true}}, - // 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - // }, - // }, - // }, - // }, - // modelName: "foo", - // version: 1, - // serverKey: "server", - // replicaIdx: 0, - // expectedState: Unloading, - // desiredState: Unloaded, - // numModelVersionsLoaded: 1, - // modelVersionLoaded: false, - // availableMemory: 20, - // modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, - // err: false, - //}, - //{ - // name: "Model updated but not latest on replica which is Available", - // store: &LocalSchedulerStore{ - // models: map[string]*Model{"foo": { - // versions: []*ModelVersion{ - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // version: 1, - // replicas: map[int]ReplicaStatus{ - // 0: {State: Unloading}, - // }, - // }, - // { - // modelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, - // version: 2, - // replicas: map[int]ReplicaStatus{ - // 0: {State: Available}, - // }, - // }, - // }, - // }}, - // servers: map[string]*Server{ - // "server": { - // name: "server", - // replicas: map[int]*ServerReplica{ - // 0: {loadedModels: map[ModelVersionID]bool{{Name: "foo", Version: 2}: true, {Name: "foo", Version: 1}: true}, reservedMemory: memBytes * 2, uniqueLoadedModels: map[string]bool{"foo": true}}, - // 1: {loadedModels: map[ModelVersionID]bool{}, reservedMemory: memBytes, uniqueLoadedModels: map[string]bool{}}, - // }, - // }, - // }, - // }, - // modelName: "foo", - // version: 1, - // serverKey: "server", - // replicaIdx: 0, - // expectedState: Unloading, - // desiredState: Unloaded, - // numModelVersionsLoaded: 1, - // modelVersionLoaded: false, - // availableMemory: 20, - // modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(2)}}}, - // err: false, - //}, + { + name: "LoadedModel", + models: []*db.Model{{ + Name: "model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{}, + State: &db.ModelStatus{}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {LoadedModels: []*db.ModelVersionID{}, ReservedMemory: memBytes, UniqueLoadedModels: map[string]bool{}}, + 1: {LoadedModels: []*db.ModelVersionID{}, ReservedMemory: memBytes, UniqueLoadedModels: map[string]bool{}}, + }, + }, + }, + modelName: "model", + version: 1, + serverKey: "server", + replicaIdx: 0, + expectedState: db.ModelReplicaState_ModelReplicaStateUnknown, + desiredState: db.ModelReplicaState_Loaded, + numModelVersionsLoaded: 1, + modelVersionLoaded: true, + availableMemory: 20, + modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, + }, + { + name: "UnloadedModel", + models: []*db.Model{{ + Name: "model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{}, + State: &db.ModelStatus{}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {LoadedModels: []*db.ModelVersionID{}, ReservedMemory: memBytes, UniqueLoadedModels: map[string]bool{}}, + 1: {LoadedModels: []*db.ModelVersionID{}, ReservedMemory: memBytes, UniqueLoadedModels: map[string]bool{}}, + }, + }, + }, + modelName: "model", + version: 1, + serverKey: "server", + replicaIdx: 0, + expectedState: db.ModelReplicaState_ModelReplicaStateUnknown, + desiredState: db.ModelReplicaState_Unloaded, + numModelVersionsLoaded: 0, + modelVersionLoaded: false, + availableMemory: 20, + modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, + }, + { + name: "Unloaded model but not matching expected state", + models: []*db.Model{{ + Name: "model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "model", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_LoadRequested}, + }, + State: &db.ModelStatus{}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {LoadedModels: []*db.ModelVersionID{}, ReservedMemory: memBytes, UniqueLoadedModels: map[string]bool{}}, + 1: {LoadedModels: []*db.ModelVersionID{}, ReservedMemory: memBytes, UniqueLoadedModels: map[string]bool{}}, + }, + }, + }, + modelName: "model", + version: 1, + serverKey: "server", + replicaIdx: 0, + expectedState: db.ModelReplicaState_Unloading, + desiredState: db.ModelReplicaState_Unloaded, + numModelVersionsLoaded: 0, + modelVersionLoaded: false, + availableMemory: 20, + err: true, + }, + { + name: "DeletedModel", + models: []*db.Model{{ + Name: "my-model", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "my-model", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{}, + State: &db.ModelStatus{}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {LoadedModels: []*db.ModelVersionID{}, ReservedMemory: memBytes, UniqueLoadedModels: make(map[string]bool)}, + 1: {LoadedModels: []*db.ModelVersionID{}, ReservedMemory: memBytes, UniqueLoadedModels: make(map[string]bool)}, + }, + }, + }, + modelName: "my-model", + version: 1, + serverKey: "server", + replicaIdx: 0, + expectedState: db.ModelReplicaState_ModelReplicaStateUnknown, + desiredState: db.ModelReplicaState_Unloaded, + numModelVersionsLoaded: 0, + modelVersionLoaded: false, + availableMemory: 20, + deleted: true, + modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, + }, + { + name: "Model updated but not latest on replica which is loaded", + models: []*db.Model{{ + Name: "foo", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "foo", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Unloading}, + }, + State: &db.ModelStatus{}, + }, + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "foo", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Version: 2, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Loaded}, + }, + State: &db.ModelStatus{}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {LoadedModels: []*db.ModelVersionID{{Name: "foo", Version: 2}, {Name: "foo", Version: 1}}, ReservedMemory: memBytes, UniqueLoadedModels: map[string]bool{"foo": true}}, + 1: {LoadedModels: []*db.ModelVersionID{}, ReservedMemory: memBytes, UniqueLoadedModels: map[string]bool{}}, + }, + }, + }, + modelName: "foo", + version: 1, + serverKey: "server", + replicaIdx: 0, + expectedState: db.ModelReplicaState_Unloading, + desiredState: db.ModelReplicaState_Unloaded, + numModelVersionsLoaded: 1, + modelVersionLoaded: false, + availableMemory: 20, + modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(1)}}}, + err: false, + }, + { + name: "Model updated but not latest on replica which is Available", + models: []*db.Model{{ + Name: "foo", + Versions: []*db.ModelVersion{ + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "foo", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Unloading}, + }, + State: &db.ModelStatus{}, + }, + { + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "foo", + }, + ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes}}, + Version: 2, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Available}, + }, + State: &db.ModelStatus{}, + }, + }, + }}, + servers: []*db.Server{ + { + Name: "server", + Replicas: map[int32]*db.ServerReplica{ + 0: {LoadedModels: []*db.ModelVersionID{{Name: "foo", Version: 2}, {Name: "foo", Version: 1}}, ReservedMemory: memBytes * 2, UniqueLoadedModels: map[string]bool{"foo": true}}, + 1: {LoadedModels: []*db.ModelVersionID{{Name: "foo", Version: 2}, {Name: "foo", Version: 1}}, ReservedMemory: memBytes, UniqueLoadedModels: map[string]bool{}}, + }, + }, + }, + modelName: "foo", + version: 1, + serverKey: "server", + replicaIdx: 0, + expectedState: db.ModelReplicaState_Unloading, + desiredState: db.ModelReplicaState_Unloaded, + numModelVersionsLoaded: 1, + modelVersionLoaded: false, + availableMemory: 20, + modelRuntimeInfo: &pb.ModelRuntimeInfo{ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{Mlserver: &pb.MLServerModelSettings{ParallelWorkers: uint32(2)}}}, + err: false, + }, { name: "Existing ModelRuntimeInfo is not overwritten", models: []*db.Model{{ @@ -1401,11 +1435,17 @@ func TestUpdateModelState(t *testing.T) { Meta: &pb.MetaData{ Name: "my-model", }, - ModelSpec: &pb.ModelSpec{MemoryBytes: &memBytes, + ModelSpec: &pb.ModelSpec{ + MemoryBytes: &memBytes, ModelRuntimeInfo: &pb.ModelRuntimeInfo{ ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{ Mlserver: &pb.MLServerModelSettings{ - ParallelWorkers: uint32(2)}}}}}, + ParallelWorkers: uint32(2), + }, + }, + }, + }, + }, Version: 1, Replicas: map[int32]*db.ReplicaStatus{}, State: &db.ModelStatus{}, @@ -1421,7 +1461,7 @@ func TestUpdateModelState(t *testing.T) { }, }, }, - modelName: "my.model", + modelName: "my-model", version: 1, serverKey: "server", replicaIdx: 0, @@ -1515,7 +1555,15 @@ func TestUpdateModelState(t *testing.T) { g.Expect(err).To(BeNil()) if !test.deleted { g.Expect(getModel(test.modelName).GetVersion(test.version).GetModelReplicaState(test.replicaIdx)).To(Equal(test.desiredState)) - g.Expect(getServer(test.serverKey).Replicas[int32(test.replicaIdx)].LoadedModels[ModelVersionID{Name: test.modelName, Version: test.version}]).To(Equal(test.modelVersionLoaded)) + + found := false + for _, v := range getServer(test.serverKey).Replicas[int32(test.replicaIdx)].LoadedModels { + if v.Name == test.modelName && v.Version == test.version && test.modelVersionLoaded { + found = true + } + } + g.Expect(found).To(Equal(test.modelVersionLoaded)) + g.Expect(getServer(test.serverKey).Replicas[int32(test.replicaIdx)].GetNumLoadedModels()).To(Equal(test.numModelVersionsLoaded)) } else { g.Expect(getModel(test.modelName).Latest().State.State).To(Equal(db.ModelState_ModelTerminated)) @@ -1527,12 +1575,24 @@ func TestUpdateModelState(t *testing.T) { } else { g.Expect(err).ToNot(BeNil()) } - if test.desiredState == db.ModelReplicaState_Loaded || test.desiredState == db.ModelReplicaState_LoadFailed { + + if test.desiredState == db.ModelReplicaState_LoadFailed { g.Expect(getServer(test.serverKey).Replicas[int32(test.replicaIdx)].GetReservedMemory()).To(Equal(uint64(0))) } else { g.Expect(getServer(test.serverKey).Replicas[int32(test.replicaIdx)].GetReservedMemory()).To(Equal(getModel(test.modelName).GetVersion(test.version).GetRequiredMemory())) } + toUniqueModels := func(loadedModels []*db.ModelVersionID) map[string]bool { + if loadedModels == nil { + return nil + } + uniqueModels := make(map[string]bool) + for _, key := range loadedModels { + uniqueModels[key.Name] = true + } + return uniqueModels + } + uniqueLoadedModels := toUniqueModels(getServer(test.serverKey).Replicas[int32(test.replicaIdx)].LoadedModels) g.Expect(uniqueLoadedModels).To(Equal(getServer(test.serverKey).Replicas[int32(test.replicaIdx)].UniqueLoadedModels)) From 7c54bb1a233d4296c945e142bcc0891cd3d85c2b Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 12 Jan 2026 09:27:40 +0000 Subject: [PATCH 29/50] wip: fixng test. Additionally fixed bug where reserved memory not updated --- scheduler/pkg/store/memory.go | 14 +++++--------- scheduler/pkg/store/memory_test.go | 19 ++++++++----------- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index afaa899ba4..62705d0e4c 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -448,7 +448,7 @@ func (m *ModelServerStore) updateLoadedModelsImpl( modelKey, modelVersion.Version, serverKey, replicaIdx, ) modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_LoadRequested, "") - if err := m.updateReservedMemory(db.ModelReplicaState_LoadRequested, serverKey, + if err := m.updateReservedMemory(db.ModelReplicaState_LoadRequested, server, int(replicaIdx), modelVersion.GetRequiredMemory()); err != nil { return nil, fmt.Errorf("failed to update server %s replica %d: %w", serverKey, replicaIdx, err) } @@ -460,7 +460,7 @@ func (m *ModelServerStore) updateLoadedModelsImpl( ) if !existingState.State.AlreadyLoadingOrLoaded() { modelVersion.SetReplicaState(int(replicaIdx), db.ModelReplicaState_LoadRequested, "") - if err := m.updateReservedMemory(db.ModelReplicaState_LoadRequested, serverKey, + if err := m.updateReservedMemory(db.ModelReplicaState_LoadRequested, server, int(replicaIdx), modelVersion.GetRequiredMemory()); err != nil { return nil, fmt.Errorf("failed to update server %s replica %d: %w", serverKey, replicaIdx, err) } @@ -676,8 +676,8 @@ func (m *ModelServerStore) updateModelStateImpl( ) } - if err := m.updateReservedMemory(desiredState, serverKey, replicaIdx, modelVersion.GetRequiredMemory()); err != nil { - return nil, fmt.Errorf("failed to server %s replica %d: %w", serverKey, replicaIdx, err) + if err := m.updateReservedMemory(desiredState, server, replicaIdx, modelVersion.GetRequiredMemory()); err != nil { + return nil, fmt.Errorf("failed to update server %s replica %d reserved memory: %w", serverKey, replicaIdx, err) } deletedModelReplica := false @@ -758,14 +758,10 @@ func (m *ModelServerStore) updateModelStateImpl( } func (m *ModelServerStore) updateReservedMemory( - modelReplicaState db.ModelReplicaState, serverKey string, replicaIdx int, memBytes uint64, + modelReplicaState db.ModelReplicaState, server *db.Server, replicaIdx int, memBytes uint64, ) error { // update reserved memory that is being used for sorting replicas // do we need to lock replica update? - server, err := m.store.servers.Get(context.TODO(), serverKey) - if err != nil { - return fmt.Errorf("get server %s failed: %w", serverKey, err) - } replica, okReplica := server.Replicas[int32(replicaIdx)] update := false diff --git a/scheduler/pkg/store/memory_test.go b/scheduler/pkg/store/memory_test.go index 284ebb4f61..e14ef8f09e 100644 --- a/scheduler/pkg/store/memory_test.go +++ b/scheduler/pkg/store/memory_test.go @@ -1576,7 +1576,7 @@ func TestUpdateModelState(t *testing.T) { g.Expect(err).ToNot(BeNil()) } - if test.desiredState == db.ModelReplicaState_LoadFailed { + if test.desiredState == db.ModelReplicaState_LoadFailed || test.desiredState == db.ModelReplicaState_Loaded { g.Expect(getServer(test.serverKey).Replicas[int32(test.replicaIdx)].GetReservedMemory()).To(Equal(uint64(0))) } else { g.Expect(getServer(test.serverKey).Replicas[int32(test.replicaIdx)].GetReservedMemory()).To(Equal(getModel(test.modelName).GetVersion(test.version).GetRequiredMemory())) @@ -1616,20 +1616,15 @@ func TestUpdateModelState(t *testing.T) { } } -// TODO: This test requires refactoring for the new storage architecture. -// It tests internal implementation details (updateModelStatus) that work directly -// with LocalSchedulerStore internal types. The new architecture uses Storage interfaces -// and may need a different approach to testing these internal methods. -/* func TestUpdateModelStatus(t *testing.T) { g := NewGomegaWithT(t) type test struct { name string deleted bool - modelVersion *ModelVersion - prevAvailableModelVersion *ModelVersion - expectedState ModelState + modelVersion *db.ModelVersion + prevAvailableModelVersion *db.ModelVersion + expectedState db.ModelState expectedReason string expectedAvailableReplicas uint32 expectedTimestamp time.Time @@ -1883,7 +1878,10 @@ func TestUpdateModelStatus(t *testing.T) { logger := log.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - ms := NewModelServerStore(logger, &LocalSchedulerStore{}, eventHub) + modelStorage := NewInMemoryStorage[*db.Model]() + serverStorage := NewInMemoryStorage[*db.Server]() + + ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) ms.updateModelStatus(true, test.deleted, test.modelVersion, test.prevAvailableModelVersion) g.Expect(test.modelVersion.state.State).To(Equal(test.expectedState)) g.Expect(test.modelVersion.state.Reason).To(Equal(test.expectedReason)) @@ -1892,7 +1890,6 @@ func TestUpdateModelStatus(t *testing.T) { }) } } -*/ // TODO: This test requires refactoring for the new storage architecture. // It tests internal implementation details (addModelVersionIfNotExists) that work directly From 3c6808be7443acd86fae57f37ae998fba6aa5968 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 12 Jan 2026 11:08:56 +0000 Subject: [PATCH 30/50] wip: fixing test --- scheduler/pkg/store/memory_test.go | 1678 ++++++++++++++-------------- 1 file changed, 848 insertions(+), 830 deletions(-) diff --git a/scheduler/pkg/store/memory_test.go b/scheduler/pkg/store/memory_test.go index e14ef8f09e..b770c13686 100644 --- a/scheduler/pkg/store/memory_test.go +++ b/scheduler/pkg/store/memory_test.go @@ -17,8 +17,10 @@ import ( "time" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" log "github.com/sirupsen/logrus" "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/timestamppb" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" @@ -1633,21 +1635,22 @@ func TestUpdateModelStatus(t *testing.T) { r1 := "reason1" d2 := time.Date(2021, 1, 2, 12, 0, 0, 0, time.UTC) r2 := "reason2" + tests := []test{ { name: "Available", deleted: false, - modelVersion: NewModelVersion( - &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, - 1, - "server", - map[int]ReplicaStatus{ - 0: {State: Available, Reason: "", Timestamp: d1}, - }, - false, - ModelProgressing), + modelVersion: &db.ModelVersion{ + Server: "server", + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Available, Timestamp: timestamppb.New(d1)}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelProgressing}, + }, prevAvailableModelVersion: nil, - expectedState: ModelAvailable, + expectedState: db.ModelState_ModelAvailable, expectedAvailableReplicas: 1, expectedReason: "", expectedTimestamp: d1, @@ -1655,17 +1658,17 @@ func TestUpdateModelStatus(t *testing.T) { { name: "Scaled Down", deleted: false, - modelVersion: NewModelVersion( - &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 0}}, - 1, - "server", - map[int]ReplicaStatus{ - 0: {State: Unloaded, Reason: "", Timestamp: d1}, - }, - false, - ModelProgressing), + modelVersion: &db.ModelVersion{ + Server: "server", + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 0}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Unloaded, Timestamp: timestamppb.New(d1)}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelProgressing}, + }, prevAvailableModelVersion: nil, - expectedState: ModelScaledDown, + expectedState: db.ModelState_ModelScaledDown, expectedAvailableReplicas: 0, expectedReason: "", expectedTimestamp: d1, @@ -1673,18 +1676,18 @@ func TestUpdateModelStatus(t *testing.T) { { name: "Progressing", deleted: false, - modelVersion: NewModelVersion( - &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, - 1, - "server", - map[int]ReplicaStatus{ - 0: {State: Available, Reason: "", Timestamp: d1}, - 1: {State: Loading, Reason: "", Timestamp: d1}, - }, - false, - ModelProgressing), + modelVersion: &db.ModelVersion{ + Server: "server", + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Available, Timestamp: timestamppb.New(d1)}, + 1: {State: db.ModelReplicaState_Loading, Timestamp: timestamppb.New(d1)}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelProgressing}, + }, prevAvailableModelVersion: nil, - expectedState: ModelProgressing, + expectedState: db.ModelState_ModelProgressing, expectedAvailableReplicas: 1, expectedReason: "", expectedTimestamp: d1, @@ -1692,17 +1695,17 @@ func TestUpdateModelStatus(t *testing.T) { { name: "Failed", deleted: false, - modelVersion: NewModelVersion( - &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, - 1, - "server", - map[int]ReplicaStatus{ - 0: {State: LoadFailed, Reason: r1, Timestamp: d1}, - }, - false, - ModelProgressing), + modelVersion: &db.ModelVersion{ + Server: "server", + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_LoadFailed, Reason: r1, Timestamp: timestamppb.New(d1)}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelProgressing}, + }, prevAvailableModelVersion: nil, - expectedState: ModelFailed, + expectedState: db.ModelState_ModelFailed, expectedAvailableReplicas: 0, expectedReason: r1, expectedTimestamp: d1, @@ -1710,18 +1713,18 @@ func TestUpdateModelStatus(t *testing.T) { { name: "AvailableAndFailed", deleted: false, - modelVersion: NewModelVersion( - &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, - 1, - "server", - map[int]ReplicaStatus{ - 0: {State: Loaded, Reason: "", Timestamp: d1}, - 1: {State: LoadFailed, Reason: r1, Timestamp: d2}, - }, - false, - ModelProgressing), + modelVersion: &db.ModelVersion{ + Server: "server", + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Loaded, Timestamp: timestamppb.New(d1)}, + 1: {State: db.ModelReplicaState_LoadFailed, Reason: r1, Timestamp: timestamppb.New(d2)}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelProgressing}, + }, prevAvailableModelVersion: nil, - expectedState: ModelFailed, + expectedState: db.ModelState_ModelFailed, expectedAvailableReplicas: 0, expectedReason: r1, expectedTimestamp: d2, @@ -1729,18 +1732,18 @@ func TestUpdateModelStatus(t *testing.T) { { name: "TwoFailed", deleted: false, - modelVersion: NewModelVersion( - &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, - 1, - "server", - map[int]ReplicaStatus{ - 0: {State: LoadFailed, Reason: r1, Timestamp: d1}, - 1: {State: LoadFailed, Reason: r2, Timestamp: d2}, - }, - false, - ModelProgressing), + modelVersion: &db.ModelVersion{ + Server: "server", + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_LoadFailed, Reason: r1, Timestamp: timestamppb.New(d1)}, + 1: {State: db.ModelReplicaState_LoadFailed, Reason: r2, Timestamp: timestamppb.New(d2)}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelProgressing}, + }, prevAvailableModelVersion: nil, - expectedState: ModelFailed, + expectedState: db.ModelState_ModelFailed, expectedAvailableReplicas: 0, expectedReason: r2, expectedTimestamp: d2, @@ -1748,26 +1751,26 @@ func TestUpdateModelStatus(t *testing.T) { { name: "AvailableV2", deleted: false, - modelVersion: NewModelVersion( - &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, - 1, - "server", - map[int]ReplicaStatus{ - 0: {State: Loading, Reason: "", Timestamp: d1}, - 1: {State: Available, Reason: "", Timestamp: d2}, - }, - false, - ModelProgressing), - prevAvailableModelVersion: NewModelVersion( - &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, - 1, - "server", - map[int]ReplicaStatus{ - 0: {State: Available, Reason: "", Timestamp: d1}, - }, - false, - ModelAvailable), - expectedState: ModelAvailable, + modelVersion: &db.ModelVersion{ + Server: "server", + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Loading, Timestamp: timestamppb.New(d1)}, + 1: {State: db.ModelReplicaState_Available, Timestamp: timestamppb.New(d2)}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelProgressing}, + }, + prevAvailableModelVersion: &db.ModelVersion{ + Server: "server", + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Available, Timestamp: timestamppb.New(d1)}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + expectedState: db.ModelState_ModelAvailable, expectedAvailableReplicas: 1, expectedReason: "", expectedTimestamp: d2, @@ -1775,17 +1778,17 @@ func TestUpdateModelStatus(t *testing.T) { { name: "Terminating", deleted: true, - modelVersion: NewModelVersion( - &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, - 1, - "server", - map[int]ReplicaStatus{ - 0: {State: Unloading, Reason: "", Timestamp: d1}, - 1: {State: Unloading, Reason: "", Timestamp: d2}, - }, - true, - ModelProgressing), - expectedState: ModelTerminating, + modelVersion: &db.ModelVersion{ + Server: "server", + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Unloading, Timestamp: timestamppb.New(d1)}, + 1: {State: db.ModelReplicaState_Unloading, Timestamp: timestamppb.New(d2)}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelProgressing}, + }, + expectedState: db.ModelState_ModelTerminating, expectedAvailableReplicas: 0, expectedReason: "", expectedTimestamp: d2, @@ -1793,17 +1796,17 @@ func TestUpdateModelStatus(t *testing.T) { { name: "TerminatingLoadingReplicas", deleted: true, - modelVersion: NewModelVersion( - &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, - 2, - "server", - map[int]ReplicaStatus{ - 0: {State: Loading, Reason: "", Timestamp: d1}, - 1: {State: Loading, Reason: "", Timestamp: d2}, - }, - true, - ModelProgressing), - expectedState: ModelTerminating, + modelVersion: &db.ModelVersion{ + Server: "server", + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, + Version: 2, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Loading, Timestamp: timestamppb.New(d1)}, + 1: {State: db.ModelReplicaState_Loading, Timestamp: timestamppb.New(d2)}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelProgressing}, + }, + expectedState: db.ModelState_ModelTerminating, expectedAvailableReplicas: 0, expectedReason: "", expectedTimestamp: d2, @@ -1811,17 +1814,17 @@ func TestUpdateModelStatus(t *testing.T) { { name: "Terminated", deleted: true, - modelVersion: NewModelVersion( - &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, - 1, - "server", - map[int]ReplicaStatus{ - 0: {State: Unloaded, Reason: "", Timestamp: d1}, - 1: {State: Unloaded, Reason: "", Timestamp: d2}, - }, - true, - ModelProgressing), - expectedState: ModelTerminated, + modelVersion: &db.ModelVersion{ + Server: "server", + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Unloaded, Timestamp: timestamppb.New(d1)}, + 1: {State: db.ModelReplicaState_Unloaded, Timestamp: timestamppb.New(d2)}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelProgressing}, + }, + expectedState: db.ModelState_ModelTerminated, expectedAvailableReplicas: 0, expectedReason: "", expectedTimestamp: d2, @@ -1829,17 +1832,17 @@ func TestUpdateModelStatus(t *testing.T) { { name: "TerminateFailed", deleted: true, - modelVersion: NewModelVersion( - &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, - 1, - "server", - map[int]ReplicaStatus{ - 0: {State: UnloadFailed, Reason: r1, Timestamp: d1}, - 1: {State: Unloaded, Reason: "", Timestamp: d2}, - }, - true, - ModelProgressing), - expectedState: ModelTerminateFailed, + modelVersion: &db.ModelVersion{ + Server: "server", + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_UnloadFailed, Reason: r1, Timestamp: timestamppb.New(d1)}, + 1: {State: db.ModelReplicaState_Unloaded, Timestamp: timestamppb.New(d2)}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelProgressing}, + }, + expectedState: db.ModelState_ModelTerminateFailed, expectedAvailableReplicas: 0, expectedReason: r1, expectedTimestamp: d1, @@ -1847,27 +1850,27 @@ func TestUpdateModelStatus(t *testing.T) { { name: "AvailableV2PrevTerminated", deleted: false, - modelVersion: NewModelVersion( - &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, - 2, - "server", - map[int]ReplicaStatus{ - 0: {State: Available, Reason: "", Timestamp: d1}, - 1: {State: Available, Reason: "", Timestamp: d2}, - }, - false, - ModelProgressing), - prevAvailableModelVersion: NewModelVersion( - &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, - 1, - "server", - map[int]ReplicaStatus{ - 0: {State: Available, Reason: "", Timestamp: d1}, - 1: {State: Available, Reason: "", Timestamp: d2}, - }, - false, - ModelTerminating), - expectedState: ModelAvailable, + modelVersion: &db.ModelVersion{ + Server: "server", + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, + Version: 2, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Available, Timestamp: timestamppb.New(d1)}, + 1: {State: db.ModelReplicaState_Available, Timestamp: timestamppb.New(d2)}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelProgressing}, + }, + prevAvailableModelVersion: &db.ModelVersion{ + Server: "server", + ModelDefn: &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, + Version: 2, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Available, Timestamp: timestamppb.New(d1)}, + 1: {State: db.ModelReplicaState_Available, Timestamp: timestamppb.New(d2)}, + }, + State: &db.ModelStatus{State: db.ModelState_ModelTerminating}, + }, + expectedState: db.ModelState_ModelAvailable, expectedAvailableReplicas: 2, expectedReason: "", expectedTimestamp: d2, @@ -1878,30 +1881,32 @@ func TestUpdateModelStatus(t *testing.T) { logger := log.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - modelStorage := NewInMemoryStorage[*db.Model]() - serverStorage := NewInMemoryStorage[*db.Server]() - ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) - ms.updateModelStatus(true, test.deleted, test.modelVersion, test.prevAvailableModelVersion) - g.Expect(test.modelVersion.state.State).To(Equal(test.expectedState)) - g.Expect(test.modelVersion.state.Reason).To(Equal(test.expectedReason)) - g.Expect(test.modelVersion.state.AvailableReplicas).To(Equal(test.expectedAvailableReplicas)) - g.Expect(test.modelVersion.state.Timestamp).To(Equal(test.expectedTimestamp)) + const modelName = "some-model" + + modelStore := NewInMemoryStorage[*db.Model]() + err = modelStore.Insert(context.TODO(), &db.Model{Name: modelName}) + g.Expect(err).To(BeNil()) + + ms := NewModelServerStore(logger, modelStore, NewInMemoryStorage[*db.Server](), eventHub) + err = ms.updateModelStatus(true, test.deleted, test.modelVersion, test.prevAvailableModelVersion, &db.Model{ + Name: modelName, + }) + g.Expect(err).To(BeNil()) + g.Expect(test.modelVersion.State.State).To(Equal(test.expectedState)) + g.Expect(test.modelVersion.State.Reason).To(Equal(test.expectedReason)) + g.Expect(test.modelVersion.State.AvailableReplicas).To(Equal(test.expectedAvailableReplicas)) + g.Expect(test.modelVersion.State.Timestamp.AsTime()).To(Equal(test.expectedTimestamp)) }) } } -// TODO: This test requires refactoring for the new storage architecture. -// It tests internal implementation details (addModelVersionIfNotExists) that work directly -// with LocalSchedulerStore. The new architecture uses Storage interfaces and may need -// a different approach to testing these internal methods. -/* func TestAddModelVersionIfNotExists(t *testing.T) { g := NewGomegaWithT(t) type test struct { name string - store *LocalSchedulerStore + models []*db.Model modelVersion *agent.ModelVersion expected []uint32 latest uint32 @@ -1909,70 +1914,8 @@ func TestAddModelVersionIfNotExists(t *testing.T) { tests := []test{ { - name: "Add new version when none exist", - store: &LocalSchedulerStore{ - models: map[string]*Model{}, - }, - modelVersion: &agent.ModelVersion{ - Version: 1, - Model: &pb.Model{ - Meta: &pb.MetaData{Name: "foo"}, - }, - }, - expected: []uint32{1}, - latest: 1, - }, - { - name: "AddNewVersion", - store: &LocalSchedulerStore{ - models: map[string]*Model{"foo": { - versions: []*ModelVersion{}, - }}, - }, - modelVersion: &agent.ModelVersion{ - Version: 1, - Model: &pb.Model{ - Meta: &pb.MetaData{Name: "foo"}, - }, - }, - expected: []uint32{1}, - latest: 1, - }, - { - name: "AddSecondVersion", - store: &LocalSchedulerStore{ - models: map[string]*Model{"foo": { - versions: []*ModelVersion{ - { - version: 1, - modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, - replicas: map[int]ReplicaStatus{}, - }, - }, - }}, - }, - modelVersion: &agent.ModelVersion{ - Version: 2, - Model: &pb.Model{ - Meta: &pb.MetaData{Name: "foo"}, - }, - }, - expected: []uint32{1, 2}, - latest: 2, - }, - { - name: "Existing", - store: &LocalSchedulerStore{ - models: map[string]*Model{"foo": { - versions: []*ModelVersion{ - { - version: 1, - modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, - replicas: map[int]ReplicaStatus{}, - }, - }, - }}, - }, + name: "Add new version when none exist", + models: []*db.Model{}, modelVersion: &agent.ModelVersion{ Version: 1, Model: &pb.Model{ @@ -1982,443 +1925,120 @@ func TestAddModelVersionIfNotExists(t *testing.T) { expected: []uint32{1}, latest: 1, }, - { - name: "AddThirdVersion", - store: &LocalSchedulerStore{ - models: map[string]*Model{"foo": { - versions: []*ModelVersion{ - { - version: 1, - modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, - replicas: map[int]ReplicaStatus{}, - }, - { - version: 2, - modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, - replicas: map[int]ReplicaStatus{}, - }, - }, - }}, - }, - modelVersion: &agent.ModelVersion{ - Version: 3, - Model: &pb.Model{ - Meta: &pb.MetaData{Name: "foo"}, - }, - }, - expected: []uint32{1, 2, 3}, - latest: 3, - }, - { - name: "AddThirdVersionInMiddle", - store: &LocalSchedulerStore{ - models: map[string]*Model{"foo": { - versions: []*ModelVersion{ - { - version: 1, - modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, - replicas: map[int]ReplicaStatus{}, - }, - { - version: 3, - modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, - replicas: map[int]ReplicaStatus{}, - }, - }, - }}, - }, - modelVersion: &agent.ModelVersion{ - Version: 2, - Model: &pb.Model{ - Meta: &pb.MetaData{Name: "foo"}, - }, - }, - expected: []uint32{1, 2, 3}, - latest: 3, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - logger := log.New() - eventHub, err := coordinator.NewEventHub(logger) - g.Expect(err).To(BeNil()) - ms := NewModelServerStore(logger, test.store, eventHub) - ms.addModelVersionIfNotExists(test.modelVersion) - modelName := test.modelVersion.GetModel().GetMeta().GetName() - g.Expect(test.store.models[modelName].GetVersions()).To(Equal(test.expected)) - g.Expect(test.store.models[modelName].Latest().version).To(Equal(test.latest)) - }) - } -} - -func TestAddServerReplica(t *testing.T) { - g := NewGomegaWithT(t) - ctx := context.Background() - - type test struct { - name string - models []*db.Model - servers []*db.Server - req *agent.AgentSubscribeRequest - expectedSnapshot []*ServerSnapshot - expectedModelEvents int64 - expectedServerEvents int64 - } - - tests := []test{ - { - name: "AddServerReplica - existing server", - models: []*db.Model{}, - servers: []*db.Server{ - { - Name: "server1", - Replicas: map[int32]*db.ServerReplica{ - 0: {}, - 1: {}, - }, - ExpectedReplicas: 3, - Shared: true, - }, - }, - req: &agent.AgentSubscribeRequest{ - ServerName: "server1", - ReplicaIdx: 2, - Shared: true, - }, - expectedSnapshot: []*ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*ServerReplica{ - 0: {}, - 1: {}, - 2: {}, - }, - ExpectedReplicas: 3, - Shared: true, - }, - }, - expectedModelEvents: 0, - expectedServerEvents: 1, - }, - { - name: "AddServerReplica - new server", - models: []*db.Model{}, - servers: []*db.Server{}, - req: &agent.AgentSubscribeRequest{ - ServerName: "server1", - ReplicaIdx: 0, - Shared: true, - }, - expectedSnapshot: []*ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*ServerReplica{ - 0: {}, - }, - ExpectedReplicas: -1, // expected replicas is not set - Shared: true, - }, - }, - expectedModelEvents: 0, - expectedServerEvents: 1, - }, - { - name: "AddServerReplica - with loaded models", - models: []*db.Model{}, - servers: []*db.Server{}, - req: &agent.AgentSubscribeRequest{ - ServerName: "server1", - ReplicaIdx: 0, - Shared: true, - LoadedModels: []*agent.ModelVersion{ - { - Model: &pb.Model{ - Meta: &pb.MetaData{Name: "model1"}, - ModelSpec: &pb.ModelSpec{}, - }, - Version: 1, - }, - { - Model: &pb.Model{ - Meta: &pb.MetaData{Name: "model2"}, - ModelSpec: &pb.ModelSpec{}, - }, - Version: 1, - }, - }, - }, - expectedSnapshot: []*ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*ServerReplica{ - 0: {}, - }, - ExpectedReplicas: -1, // expected replicas is not set - Shared: true, - }, - }, - expectedModelEvents: 2, - expectedServerEvents: 1, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - logger := log.New() - eventHub, err := coordinator.NewEventHub(logger) - g.Expect(err).To(BeNil()) - - // Create storage instances - modelStorage := NewInMemoryStorage[*db.Model]() - serverStorage := NewInMemoryStorage[*db.Server]() - - // Populate storage with test data - for _, model := range test.models { - err := modelStorage.Insert(ctx, model) - g.Expect(err).To(BeNil()) - } - for _, server := range test.servers { - err := serverStorage.Insert(ctx, server) - g.Expect(err).To(BeNil()) - } - - // Create MemoryStore with populated storage - ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) - - // register a callback to check if the event is triggered - serverEvents := int64(0) - eventHub.RegisterServerEventHandler( - "handler-server", - 10, - logger, - func(event coordinator.ServerEventMsg) { atomic.AddInt64(&serverEvents, 1) }, - ) - - modelEvents := int64(0) - eventHub.RegisterModelEventHandler( - "handler-model", - 10, - logger, - func(event coordinator.ModelEventMsg) { atomic.AddInt64(&modelEvents, 1) }, - ) - - err = ms.AddServerReplica(test.req) - g.Expect(err).To(BeNil()) - actualSnapshot, err := ms.GetServers(true, false) - g.Expect(err).To(BeNil()) - for idx, server := range actualSnapshot { - g.Expect(server.Name).To(Equal(test.expectedSnapshot[idx].Name)) - g.Expect(server.Shared).To(Equal(test.expectedSnapshot[idx].Shared)) - g.Expect(server.ExpectedReplicas).To(Equal(test.expectedSnapshot[idx].ExpectedReplicas)) - g.Expect(len(server.Replicas)).To(Equal(len(test.expectedSnapshot[idx].Replicas))) - } - - time.Sleep(10 * time.Millisecond) - g.Expect(atomic.LoadInt64(&serverEvents)).To(Equal(test.expectedServerEvents)) - g.Expect(atomic.LoadInt64(&modelEvents)).To(Equal(test.expectedModelEvents)) - }) - } -} - -func TestRemoveServerReplica(t *testing.T) { - g := NewGomegaWithT(t) - ctx := context.Background() - - type test struct { - name string - models []*db.Model - servers []*db.Server - serverName string - replicaIdx int - serverExists bool - modelsReturned int - } - - tests := []test{ - { - name: "ReplicaRemovedButNotDeleted", - models: []*db.Model{}, - servers: []*db.Server{ - { - Name: "server1", - Replicas: map[int32]*db.ServerReplica{ - 0: { - LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, - }, - 1: {}, - }, - ExpectedReplicas: 2, - Shared: true, - }, - }, - serverName: "server1", - replicaIdx: 0, - serverExists: true, - modelsReturned: 0, // no models really defined in store - }, - { - name: "ReplicaRemovedAndDeleted", - models: []*db.Model{ - { - Name: "model1", - Versions: []*db.ModelVersion{ - { - Version: 1, - Replicas: make(map[int32]*db.ReplicaStatus), - }, - }, - }, - }, - servers: []*db.Server{ - { - Name: "server1", - Replicas: map[int32]*db.ServerReplica{ - 0: { - LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, - }, - 1: {}, - }, - ExpectedReplicas: -1, - Shared: true, - }, - }, - serverName: "server1", - replicaIdx: 0, - serverExists: true, - modelsReturned: 1, - }, - { - name: "ReplicaRemovedAndServerDeleted", - models: []*db.Model{ - { - Name: "model1", - Versions: []*db.ModelVersion{ - { - Version: 1, - Replicas: make(map[int32]*db.ReplicaStatus), - }, - }, - }, - }, - servers: []*db.Server{ - { - Name: "server1", - Replicas: map[int32]*db.ServerReplica{ - 0: { - LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, - }, - }, - ExpectedReplicas: 0, - Shared: true, - }, - }, - serverName: "server1", - replicaIdx: 0, - serverExists: false, - modelsReturned: 1, - }, - { - name: "ReplicaRemovedAndServerDeleted but no model version in store", - models: []*db.Model{}, - servers: []*db.Server{ - { - Name: "server1", - Replicas: map[int32]*db.ServerReplica{ - 0: { - LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, - }, - }, - ExpectedReplicas: 0, - Shared: true, - }, - }, - serverName: "server1", - replicaIdx: 0, - serverExists: false, - modelsReturned: 0, - }, - { - name: "ReplicaRemovedAndDeleted - loading models", - models: []*db.Model{ - { - Name: "model1", - Versions: []*db.ModelVersion{ - { - Version: 1, - Replicas: make(map[int32]*db.ReplicaStatus), - }, - }, - }, - { - Name: "model2", - Versions: []*db.ModelVersion{ - { - Version: 1, - Replicas: make(map[int32]*db.ReplicaStatus), - }, - }, - }, - }, - servers: []*db.Server{ - { - Name: "server1", - Replicas: map[int32]*db.ServerReplica{ - 0: { - LoadedModels: []*db.ModelVersionID{ - {Name: "model1", Version: 1}, - {Name: "model2", Version: 1}, - }, - }, - 1: {}, - }, - ExpectedReplicas: -1, - Shared: true, - }, - }, - serverName: "server1", - replicaIdx: 0, - serverExists: true, - modelsReturned: 2, - }, - { - name: "ReplicaRemovedAndDeleted - non latest models", - models: []*db.Model{ - { - Name: "model1", - Versions: []*db.ModelVersion{ - { - Version: 1, - Replicas: map[int32]*db.ReplicaStatus{ - 0: {State: db.ModelReplicaState_Loaded}, - }, - }, - { - Version: 2, - Replicas: map[int32]*db.ReplicaStatus{ - 0: {State: db.ModelReplicaState_LoadFailed}, - }, - }, - }, - }, - }, - servers: []*db.Server{ - { - Name: "server1", - Replicas: map[int32]*db.ServerReplica{ - 0: { - LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, - }, - }, - ExpectedReplicas: -1, - Shared: true, - }, - }, - serverName: "server1", - replicaIdx: 0, - serverExists: false, - modelsReturned: 0, - }, + //{ + // name: "AddNewVersion", + // store: &LocalSchedulerStore{ + // models: map[string]*Model{"foo": { + // versions: []*ModelVersion{}, + // }}, + // }, + // modelVersion: &agent.ModelVersion{ + // Version: 1, + // Model: &pb.Model{ + // Meta: &pb.MetaData{Name: "foo"}, + // }, + // }, + // expected: []uint32{1}, + // latest: 1, + //}, + //{ + // name: "AddSecondVersion", + // store: &LocalSchedulerStore{ + // models: map[string]*Model{"foo": { + // versions: []*ModelVersion{ + // { + // version: 1, + // modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + // replicas: map[int]ReplicaStatus{}, + // }, + // }, + // }}, + // }, + // modelVersion: &agent.ModelVersion{ + // Version: 2, + // Model: &pb.Model{ + // Meta: &pb.MetaData{Name: "foo"}, + // }, + // }, + // expected: []uint32{1, 2}, + // latest: 2, + //}, + //{ + // name: "Existing", + // store: &LocalSchedulerStore{ + // models: map[string]*Model{"foo": { + // versions: []*ModelVersion{ + // { + // version: 1, + // modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + // replicas: map[int]ReplicaStatus{}, + // }, + // }, + // }}, + // }, + // modelVersion: &agent.ModelVersion{ + // Version: 1, + // Model: &pb.Model{ + // Meta: &pb.MetaData{Name: "foo"}, + // }, + // }, + // expected: []uint32{1}, + // latest: 1, + //}, + //{ + // name: "AddThirdVersion", + // store: &LocalSchedulerStore{ + // models: map[string]*Model{"foo": { + // versions: []*ModelVersion{ + // { + // version: 1, + // modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + // replicas: map[int]ReplicaStatus{}, + // }, + // { + // version: 2, + // modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + // replicas: map[int]ReplicaStatus{}, + // }, + // }, + // }}, + // }, + // modelVersion: &agent.ModelVersion{ + // Version: 3, + // Model: &pb.Model{ + // Meta: &pb.MetaData{Name: "foo"}, + // }, + // }, + // expected: []uint32{1, 2, 3}, + // latest: 3, + //}, + //{ + // name: "AddThirdVersionInMiddle", + // store: &LocalSchedulerStore{ + // models: map[string]*Model{"foo": { + // versions: []*ModelVersion{ + // { + // version: 1, + // modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + // replicas: map[int]ReplicaStatus{}, + // }, + // { + // version: 3, + // modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + // replicas: map[int]ReplicaStatus{}, + // }, + // }, + // }}, + // }, + // modelVersion: &agent.ModelVersion{ + // Version: 2, + // Model: &pb.Model{ + // Meta: &pb.MetaData{Name: "foo"}, + // }, + // }, + // expected: []uint32{1, 2, 3}, + // latest: 3, + //}, } for _, test := range tests { @@ -2427,185 +2047,583 @@ func TestRemoveServerReplica(t *testing.T) { eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - // Create storage instances modelStorage := NewInMemoryStorage[*db.Model]() serverStorage := NewInMemoryStorage[*db.Server]() // Populate storage with test data for _, model := range test.models { - err := modelStorage.Insert(ctx, model) + err := modelStorage.Insert(context.TODO(), model) g.Expect(err).To(BeNil()) } - for _, server := range test.servers { - err := serverStorage.Insert(ctx, server) + getModel := func(name string) *db.Model { + model, err := modelStorage.Get(context.TODO(), name) g.Expect(err).To(BeNil()) + return model } - // Create MemoryStore with populated storage ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) - - models, err := ms.RemoveServerReplica(test.serverName, test.replicaIdx) - g.Expect(err).To(BeNil()) - g.Expect(test.modelsReturned).To(Equal(len(models))) - server, _, err := ms.GetServer(test.serverName, false, true) - if test.serverExists { - g.Expect(err).To(BeNil()) - g.Expect(server).ToNot(BeNil()) - } else { - g.Expect(err).ToNot(BeNil()) - g.Expect(server).To(BeNil()) - } + ms.addModelVersionIfNotExists(test.modelVersion) + modelName := test.modelVersion.GetModel().GetMeta().GetName() + g.Expect(getModel(modelName).GetVersions()).To(Equal(test.expected)) + g.Expect(getModel(modelName).Latest().Version).To(Equal(test.latest)) }) } } -func TestDrainServerReplica(t *testing.T) { - g := NewGomegaWithT(t) - ctx := context.Background() - - type test struct { - name string - models []*db.Model - servers []*db.Server - serverName string - replicaIdx int - modelsReturned []string - } - - // if we have models returned check status is Draining - tests := []test{ - { - name: "ReplicaSetDrainingNoModels", - servers: []*db.Server{ - { - Name: "server1", - Replicas: map[int32]*db.ServerReplica{ - 0: {}, - 1: {}, - }, - ExpectedReplicas: 2, - Shared: true, - }, - }, - serverName: "server1", - replicaIdx: 0, - modelsReturned: []string{}, - }, - { - name: "ReplicaSetDrainingWithLoadedModels", - models: []*db.Model{ - { - Name: "model1", - Versions: []*db.ModelVersion{ - { - Version: 1, - Replicas: map[int32]*db.ReplicaStatus{0: {State: db.ModelReplicaState_Loaded}}, - }, - }, - }, - }, - servers: []*db.Server{ - { - Name: "server1", - Replicas: map[int32]*db.ServerReplica{ - 0: {LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}}, - 1: {}, - }, - ExpectedReplicas: -1, - Shared: true, - }, - }, - serverName: "server1", - replicaIdx: 0, - modelsReturned: []string{"model1"}, - }, - { - name: "ReplicaSetDrainingWithLoadedAndLoadingModels", - models: []*db.Model{ - { - Name: "model1", - Versions: []*db.ModelVersion{ - { - Version: 1, - Replicas: map[int32]*db.ReplicaStatus{ - 0: {State: db.ModelReplicaState_Loaded}}, - }, - }, - }, - { - Name: "model2", - Versions: []*db.ModelVersion{ - { - Version: 1, - Replicas: map[int32]*db.ReplicaStatus{ - 0: {State: db.ModelReplicaState_Loading}}, - }, - }, - }, - }, - servers: []*db.Server{ - { - Name: "server1", - Replicas: map[int32]*db.ServerReplica{ - 0: { - LoadedModels: []*db.ModelVersionID{ - {Name: "model1", Version: 1}, - }, - LoadingModels: []*db.ModelVersionID{ - {Name: "model2", Version: 1}, - }, - }, - 1: {}, - }, - ExpectedReplicas: -1, - Shared: true, - }, - }, - serverName: "server1", - replicaIdx: 0, - modelsReturned: []string{"model1", "model2"}, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - logger := log.New() - eventHub, err := coordinator.NewEventHub(logger) - g.Expect(err).To(BeNil()) - - // Create storage instances - modelStorage := NewInMemoryStorage[*db.Model]() - serverStorage := NewInMemoryStorage[*db.Server]() - - // Populate storage with test data - for _, model := range test.models { - err := modelStorage.Insert(ctx, model) - g.Expect(err).To(BeNil()) - } - for _, server := range test.servers { - err := serverStorage.Insert(ctx, server) - g.Expect(err).To(BeNil()) - } - - // Create MemoryStore with populated storage - ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) - - models, err := ms.DrainServerReplica(test.serverName, test.replicaIdx) - g.Expect(err).To(BeNil()) - g.Expect(test.modelsReturned).To(Equal(models)) - server, _, err := ms.GetServer(test.serverName, false, true) - g.Expect(err).To(BeNil()) - g.Expect(server).ToNot(BeNil()) - g.Expect(server.Replicas[int32(test.replicaIdx)].GetIsDraining()).To(BeTrue()) - - if test.modelsReturned != nil { - for _, model := range test.modelsReturned { - dbModel, _ := ms.GetModel(model) - state := dbModel.Latest().GetModelReplicaState(test.replicaIdx) - g.Expect(state).To(Equal(db.ModelReplicaState_Draining)) - } - } - }) - } -} -*/ +// +//func TestAddServerReplica(t *testing.T) { +// g := NewGomegaWithT(t) +// ctx := context.Background() +// +// type test struct { +// name string +// models []*db.Model +// servers []*db.Server +// req *agent.AgentSubscribeRequest +// expectedSnapshot []*ServerSnapshot +// expectedModelEvents int64 +// expectedServerEvents int64 +// } +// +// tests := []test{ +// { +// name: "AddServerReplica - existing server", +// models: []*db.Model{}, +// servers: []*db.Server{ +// { +// Name: "server1", +// Replicas: map[int32]*db.ServerReplica{ +// 0: {}, +// 1: {}, +// }, +// ExpectedReplicas: 3, +// Shared: true, +// }, +// }, +// req: &agent.AgentSubscribeRequest{ +// ServerName: "server1", +// ReplicaIdx: 2, +// Shared: true, +// }, +// expectedSnapshot: []*ServerSnapshot{ +// { +// Name: "server1", +// Replicas: map[int]*ServerReplica{ +// 0: {}, +// 1: {}, +// 2: {}, +// }, +// ExpectedReplicas: 3, +// Shared: true, +// }, +// }, +// expectedModelEvents: 0, +// expectedServerEvents: 1, +// }, +// { +// name: "AddServerReplica - new server", +// models: []*db.Model{}, +// servers: []*db.Server{}, +// req: &agent.AgentSubscribeRequest{ +// ServerName: "server1", +// ReplicaIdx: 0, +// Shared: true, +// }, +// expectedSnapshot: []*ServerSnapshot{ +// { +// Name: "server1", +// Replicas: map[int]*ServerReplica{ +// 0: {}, +// }, +// ExpectedReplicas: -1, // expected replicas is not set +// Shared: true, +// }, +// }, +// expectedModelEvents: 0, +// expectedServerEvents: 1, +// }, +// { +// name: "AddServerReplica - with loaded models", +// models: []*db.Model{}, +// servers: []*db.Server{}, +// req: &agent.AgentSubscribeRequest{ +// ServerName: "server1", +// ReplicaIdx: 0, +// Shared: true, +// LoadedModels: []*agent.ModelVersion{ +// { +// Model: &pb.Model{ +// Meta: &pb.MetaData{Name: "model1"}, +// ModelSpec: &pb.ModelSpec{}, +// }, +// Version: 1, +// }, +// { +// Model: &pb.Model{ +// Meta: &pb.MetaData{Name: "model2"}, +// ModelSpec: &pb.ModelSpec{}, +// }, +// Version: 1, +// }, +// }, +// }, +// expectedSnapshot: []*ServerSnapshot{ +// { +// Name: "server1", +// Replicas: map[int]*ServerReplica{ +// 0: {}, +// }, +// ExpectedReplicas: -1, // expected replicas is not set +// Shared: true, +// }, +// }, +// expectedModelEvents: 2, +// expectedServerEvents: 1, +// }, +// } +// +// for _, test := range tests { +// t.Run(test.name, func(t *testing.T) { +// logger := log.New() +// eventHub, err := coordinator.NewEventHub(logger) +// g.Expect(err).To(BeNil()) +// +// // Create storage instances +// modelStorage := NewInMemoryStorage[*db.Model]() +// serverStorage := NewInMemoryStorage[*db.Server]() +// +// // Populate storage with test data +// for _, model := range test.models { +// err := modelStorage.Insert(ctx, model) +// g.Expect(err).To(BeNil()) +// } +// for _, server := range test.servers { +// err := serverStorage.Insert(ctx, server) +// g.Expect(err).To(BeNil()) +// } +// +// // Create MemoryStore with populated storage +// ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) +// +// // register a callback to check if the event is triggered +// serverEvents := int64(0) +// eventHub.RegisterServerEventHandler( +// "handler-server", +// 10, +// logger, +// func(event coordinator.ServerEventMsg) { atomic.AddInt64(&serverEvents, 1) }, +// ) +// +// modelEvents := int64(0) +// eventHub.RegisterModelEventHandler( +// "handler-model", +// 10, +// logger, +// func(event coordinator.ModelEventMsg) { atomic.AddInt64(&modelEvents, 1) }, +// ) +// +// err = ms.AddServerReplica(test.req) +// g.Expect(err).To(BeNil()) +// actualSnapshot, err := ms.GetServers(true, false) +// g.Expect(err).To(BeNil()) +// for idx, server := range actualSnapshot { +// g.Expect(server.Name).To(Equal(test.expectedSnapshot[idx].Name)) +// g.Expect(server.Shared).To(Equal(test.expectedSnapshot[idx].Shared)) +// g.Expect(server.ExpectedReplicas).To(Equal(test.expectedSnapshot[idx].ExpectedReplicas)) +// g.Expect(len(server.Replicas)).To(Equal(len(test.expectedSnapshot[idx].Replicas))) +// } +// +// time.Sleep(10 * time.Millisecond) +// g.Expect(atomic.LoadInt64(&serverEvents)).To(Equal(test.expectedServerEvents)) +// g.Expect(atomic.LoadInt64(&modelEvents)).To(Equal(test.expectedModelEvents)) +// }) +// } +//} +// +//func TestRemoveServerReplica(t *testing.T) { +// g := NewGomegaWithT(t) +// ctx := context.Background() +// +// type test struct { +// name string +// models []*db.Model +// servers []*db.Server +// serverName string +// replicaIdx int +// serverExists bool +// modelsReturned int +// } +// +// tests := []test{ +// { +// name: "ReplicaRemovedButNotDeleted", +// models: []*db.Model{}, +// servers: []*db.Server{ +// { +// Name: "server1", +// Replicas: map[int32]*db.ServerReplica{ +// 0: { +// LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, +// }, +// 1: {}, +// }, +// ExpectedReplicas: 2, +// Shared: true, +// }, +// }, +// serverName: "server1", +// replicaIdx: 0, +// serverExists: true, +// modelsReturned: 0, // no models really defined in store +// }, +// { +// name: "ReplicaRemovedAndDeleted", +// models: []*db.Model{ +// { +// Name: "model1", +// Versions: []*db.ModelVersion{ +// { +// Version: 1, +// Replicas: make(map[int32]*db.ReplicaStatus), +// }, +// }, +// }, +// }, +// servers: []*db.Server{ +// { +// Name: "server1", +// Replicas: map[int32]*db.ServerReplica{ +// 0: { +// LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, +// }, +// 1: {}, +// }, +// ExpectedReplicas: -1, +// Shared: true, +// }, +// }, +// serverName: "server1", +// replicaIdx: 0, +// serverExists: true, +// modelsReturned: 1, +// }, +// { +// name: "ReplicaRemovedAndServerDeleted", +// models: []*db.Model{ +// { +// Name: "model1", +// Versions: []*db.ModelVersion{ +// { +// Version: 1, +// Replicas: make(map[int32]*db.ReplicaStatus), +// }, +// }, +// }, +// }, +// servers: []*db.Server{ +// { +// Name: "server1", +// Replicas: map[int32]*db.ServerReplica{ +// 0: { +// LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, +// }, +// }, +// ExpectedReplicas: 0, +// Shared: true, +// }, +// }, +// serverName: "server1", +// replicaIdx: 0, +// serverExists: false, +// modelsReturned: 1, +// }, +// { +// name: "ReplicaRemovedAndServerDeleted but no model version in store", +// models: []*db.Model{}, +// servers: []*db.Server{ +// { +// Name: "server1", +// Replicas: map[int32]*db.ServerReplica{ +// 0: { +// LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, +// }, +// }, +// ExpectedReplicas: 0, +// Shared: true, +// }, +// }, +// serverName: "server1", +// replicaIdx: 0, +// serverExists: false, +// modelsReturned: 0, +// }, +// { +// name: "ReplicaRemovedAndDeleted - loading models", +// models: []*db.Model{ +// { +// Name: "model1", +// Versions: []*db.ModelVersion{ +// { +// Version: 1, +// Replicas: make(map[int32]*db.ReplicaStatus), +// }, +// }, +// }, +// { +// Name: "model2", +// Versions: []*db.ModelVersion{ +// { +// Version: 1, +// Replicas: make(map[int32]*db.ReplicaStatus), +// }, +// }, +// }, +// }, +// servers: []*db.Server{ +// { +// Name: "server1", +// Replicas: map[int32]*db.ServerReplica{ +// 0: { +// LoadedModels: []*db.ModelVersionID{ +// {Name: "model1", Version: 1}, +// {Name: "model2", Version: 1}, +// }, +// }, +// 1: {}, +// }, +// ExpectedReplicas: -1, +// Shared: true, +// }, +// }, +// serverName: "server1", +// replicaIdx: 0, +// serverExists: true, +// modelsReturned: 2, +// }, +// { +// name: "ReplicaRemovedAndDeleted - non latest models", +// models: []*db.Model{ +// { +// Name: "model1", +// Versions: []*db.ModelVersion{ +// { +// Version: 1, +// Replicas: map[int32]*db.ReplicaStatus{ +// 0: {State: db.ModelReplicaState_Loaded}, +// }, +// }, +// { +// Version: 2, +// Replicas: map[int32]*db.ReplicaStatus{ +// 0: {State: db.ModelReplicaState_LoadFailed}, +// }, +// }, +// }, +// }, +// }, +// servers: []*db.Server{ +// { +// Name: "server1", +// Replicas: map[int32]*db.ServerReplica{ +// 0: { +// LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, +// }, +// }, +// ExpectedReplicas: -1, +// Shared: true, +// }, +// }, +// serverName: "server1", +// replicaIdx: 0, +// serverExists: false, +// modelsReturned: 0, +// }, +// } +// +// for _, test := range tests { +// t.Run(test.name, func(t *testing.T) { +// logger := log.New() +// eventHub, err := coordinator.NewEventHub(logger) +// g.Expect(err).To(BeNil()) +// +// // Create storage instances +// modelStorage := NewInMemoryStorage[*db.Model]() +// serverStorage := NewInMemoryStorage[*db.Server]() +// +// // Populate storage with test data +// for _, model := range test.models { +// err := modelStorage.Insert(ctx, model) +// g.Expect(err).To(BeNil()) +// } +// for _, server := range test.servers { +// err := serverStorage.Insert(ctx, server) +// g.Expect(err).To(BeNil()) +// } +// +// // Create MemoryStore with populated storage +// ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) +// +// models, err := ms.RemoveServerReplica(test.serverName, test.replicaIdx) +// g.Expect(err).To(BeNil()) +// g.Expect(test.modelsReturned).To(Equal(len(models))) +// server, _, err := ms.GetServer(test.serverName, false, true) +// if test.serverExists { +// g.Expect(err).To(BeNil()) +// g.Expect(server).ToNot(BeNil()) +// } else { +// g.Expect(err).ToNot(BeNil()) +// g.Expect(server).To(BeNil()) +// } +// }) +// } +//} +// +//func TestDrainServerReplica(t *testing.T) { +// g := NewGomegaWithT(t) +// ctx := context.Background() +// +// type test struct { +// name string +// models []*db.Model +// servers []*db.Server +// serverName string +// replicaIdx int +// modelsReturned []string +// } +// +// // if we have models returned check status is Draining +// tests := []test{ +// { +// name: "ReplicaSetDrainingNoModels", +// servers: []*db.Server{ +// { +// Name: "server1", +// Replicas: map[int32]*db.ServerReplica{ +// 0: {}, +// 1: {}, +// }, +// ExpectedReplicas: 2, +// Shared: true, +// }, +// }, +// serverName: "server1", +// replicaIdx: 0, +// modelsReturned: []string{}, +// }, +// { +// name: "ReplicaSetDrainingWithLoadedModels", +// models: []*db.Model{ +// { +// Name: "model1", +// Versions: []*db.ModelVersion{ +// { +// Version: 1, +// Replicas: map[int32]*db.ReplicaStatus{0: {State: db.ModelReplicaState_Loaded}}, +// }, +// }, +// }, +// }, +// servers: []*db.Server{ +// { +// Name: "server1", +// Replicas: map[int32]*db.ServerReplica{ +// 0: {LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}}, +// 1: {}, +// }, +// ExpectedReplicas: -1, +// Shared: true, +// }, +// }, +// serverName: "server1", +// replicaIdx: 0, +// modelsReturned: []string{"model1"}, +// }, +// { +// name: "ReplicaSetDrainingWithLoadedAndLoadingModels", +// models: []*db.Model{ +// { +// Name: "model1", +// Versions: []*db.ModelVersion{ +// { +// Version: 1, +// Replicas: map[int32]*db.ReplicaStatus{ +// 0: {State: db.ModelReplicaState_Loaded}}, +// }, +// }, +// }, +// { +// Name: "model2", +// Versions: []*db.ModelVersion{ +// { +// Version: 1, +// Replicas: map[int32]*db.ReplicaStatus{ +// 0: {State: db.ModelReplicaState_Loading}}, +// }, +// }, +// }, +// }, +// servers: []*db.Server{ +// { +// Name: "server1", +// Replicas: map[int32]*db.ServerReplica{ +// 0: { +// LoadedModels: []*db.ModelVersionID{ +// {Name: "model1", Version: 1}, +// }, +// LoadingModels: []*db.ModelVersionID{ +// {Name: "model2", Version: 1}, +// }, +// }, +// 1: {}, +// }, +// ExpectedReplicas: -1, +// Shared: true, +// }, +// }, +// serverName: "server1", +// replicaIdx: 0, +// modelsReturned: []string{"model1", "model2"}, +// }, +// } +// +// for _, test := range tests { +// t.Run(test.name, func(t *testing.T) { +// logger := log.New() +// eventHub, err := coordinator.NewEventHub(logger) +// g.Expect(err).To(BeNil()) +// +// // Create storage instances +// modelStorage := NewInMemoryStorage[*db.Model]() +// serverStorage := NewInMemoryStorage[*db.Server]() +// +// // Populate storage with test data +// for _, model := range test.models { +// err := modelStorage.Insert(ctx, model) +// g.Expect(err).To(BeNil()) +// } +// for _, server := range test.servers { +// err := serverStorage.Insert(ctx, server) +// g.Expect(err).To(BeNil()) +// } +// +// // Create MemoryStore with populated storage +// ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) +// +// models, err := ms.DrainServerReplica(test.serverName, test.replicaIdx) +// g.Expect(err).To(BeNil()) +// g.Expect(test.modelsReturned).To(Equal(models)) +// server, _, err := ms.GetServer(test.serverName, false, true) +// g.Expect(err).To(BeNil()) +// g.Expect(server).ToNot(BeNil()) +// g.Expect(server.Replicas[int32(test.replicaIdx)].GetIsDraining()).To(BeTrue()) +// +// if test.modelsReturned != nil { +// for _, model := range test.modelsReturned { +// dbModel, _ := ms.GetModel(model) +// state := dbModel.Latest().GetModelReplicaState(test.replicaIdx) +// g.Expect(state).To(Equal(db.ModelReplicaState_Draining)) +// } +// } +// }) +// } +//} From c6b597fb6dfaf317fedd92f9ee2dc8cd5baf763a Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 12 Jan 2026 12:16:35 +0000 Subject: [PATCH 31/50] fix test and fix draining bug not picking up loaded models to re-schedule --- scheduler/pkg/store/memory.go | 2 +- scheduler/pkg/store/memory_test.go | 1438 +++++++++++++++------------- 2 files changed, 757 insertions(+), 683 deletions(-) diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index 62705d0e4c..cff9ebb414 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -989,7 +989,7 @@ func (m *ModelServerStore) drainServerReplicaImpl(serverName string, replicaIdx return nil, fmt.Errorf("failed to update server %s: %w", serverName, err) } - loadedModels, err := m.findModelsToReSchedule(serverReplica.LoadingModels, replicaIdx) + loadedModels, err := m.findModelsToReSchedule(serverReplica.LoadedModels, replicaIdx) if err != nil { return nil, err } diff --git a/scheduler/pkg/store/memory_test.go b/scheduler/pkg/store/memory_test.go index b770c13686..644755313a 100644 --- a/scheduler/pkg/store/memory_test.go +++ b/scheduler/pkg/store/memory_test.go @@ -13,6 +13,7 @@ import ( "context" "errors" "sync" + "sync/atomic" "testing" "time" @@ -1908,7 +1909,7 @@ func TestAddModelVersionIfNotExists(t *testing.T) { name string models []*db.Model modelVersion *agent.ModelVersion - expected []uint32 + expected []*db.ModelVersion latest uint32 } @@ -1922,123 +1923,195 @@ func TestAddModelVersionIfNotExists(t *testing.T) { Meta: &pb.MetaData{Name: "foo"}, }, }, - expected: []uint32{1}, - latest: 1, - }, - //{ - // name: "AddNewVersion", - // store: &LocalSchedulerStore{ - // models: map[string]*Model{"foo": { - // versions: []*ModelVersion{}, - // }}, - // }, - // modelVersion: &agent.ModelVersion{ - // Version: 1, - // Model: &pb.Model{ - // Meta: &pb.MetaData{Name: "foo"}, - // }, - // }, - // expected: []uint32{1}, - // latest: 1, - //}, - //{ - // name: "AddSecondVersion", - // store: &LocalSchedulerStore{ - // models: map[string]*Model{"foo": { - // versions: []*ModelVersion{ - // { - // version: 1, - // modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, - // replicas: map[int]ReplicaStatus{}, - // }, - // }, - // }}, - // }, - // modelVersion: &agent.ModelVersion{ - // Version: 2, - // Model: &pb.Model{ - // Meta: &pb.MetaData{Name: "foo"}, - // }, - // }, - // expected: []uint32{1, 2}, - // latest: 2, - //}, - //{ - // name: "Existing", - // store: &LocalSchedulerStore{ - // models: map[string]*Model{"foo": { - // versions: []*ModelVersion{ - // { - // version: 1, - // modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, - // replicas: map[int]ReplicaStatus{}, - // }, - // }, - // }}, - // }, - // modelVersion: &agent.ModelVersion{ - // Version: 1, - // Model: &pb.Model{ - // Meta: &pb.MetaData{Name: "foo"}, - // }, - // }, - // expected: []uint32{1}, - // latest: 1, - //}, - //{ - // name: "AddThirdVersion", - // store: &LocalSchedulerStore{ - // models: map[string]*Model{"foo": { - // versions: []*ModelVersion{ - // { - // version: 1, - // modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, - // replicas: map[int]ReplicaStatus{}, - // }, - // { - // version: 2, - // modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, - // replicas: map[int]ReplicaStatus{}, - // }, - // }, - // }}, - // }, - // modelVersion: &agent.ModelVersion{ - // Version: 3, - // Model: &pb.Model{ - // Meta: &pb.MetaData{Name: "foo"}, - // }, - // }, - // expected: []uint32{1, 2, 3}, - // latest: 3, - //}, - //{ - // name: "AddThirdVersionInMiddle", - // store: &LocalSchedulerStore{ - // models: map[string]*Model{"foo": { - // versions: []*ModelVersion{ - // { - // version: 1, - // modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, - // replicas: map[int]ReplicaStatus{}, - // }, - // { - // version: 3, - // modelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, - // replicas: map[int]ReplicaStatus{}, - // }, - // }, - // }}, - // }, - // modelVersion: &agent.ModelVersion{ - // Version: 2, - // Model: &pb.Model{ - // Meta: &pb.MetaData{Name: "foo"}, - // }, - // }, - // expected: []uint32{1, 2, 3}, - // latest: 3, - //}, + expected: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + State: &db.ModelStatus{ModelGwState: db.ModelState_ModelCreate}, + }, + }, + latest: 1, + }, + { + name: "AddNewVersion", + models: []*db.Model{ + { + Name: "foo", + Versions: []*db.ModelVersion{}, + }, + }, + modelVersion: &agent.ModelVersion{ + Version: 1, + Model: &pb.Model{ + Meta: &pb.MetaData{Name: "foo"}, + }, + }, + expected: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + State: &db.ModelStatus{ModelGwState: db.ModelState_ModelCreate}, + }, + }, + latest: 1, + }, + { + name: "AddSecondVersion", + models: []*db.Model{ + { + Name: "foo", + Versions: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + Replicas: map[int32]*db.ReplicaStatus{}, + State: &db.ModelStatus{}, + }, + }, + }, + }, + modelVersion: &agent.ModelVersion{ + Version: 2, + Model: &pb.Model{ + Meta: &pb.MetaData{Name: "foo"}, + }, + }, + expected: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + State: &db.ModelStatus{}, + }, + { + Version: 2, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + State: &db.ModelStatus{ModelGwState: db.ModelState_ModelCreate}, + }, + }, + latest: 2, + }, + { + name: "Existing", + models: []*db.Model{ + { + Name: "foo", + Versions: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + Replicas: map[int32]*db.ReplicaStatus{}, + State: &db.ModelStatus{}, + }, + }, + }, + }, + modelVersion: &agent.ModelVersion{ + Version: 1, + Model: &pb.Model{ + Meta: &pb.MetaData{Name: "foo"}, + }, + }, + expected: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + State: &db.ModelStatus{}, + }, + }, + latest: 1, + }, + { + name: "AddThirdVersion", + models: []*db.Model{ + { + Name: "foo", + Versions: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + Replicas: map[int32]*db.ReplicaStatus{}, + State: &db.ModelStatus{}, + }, + { + Version: 2, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + Replicas: map[int32]*db.ReplicaStatus{}, + State: &db.ModelStatus{}, + }, + }, + }, + }, + modelVersion: &agent.ModelVersion{ + Version: 3, + Model: &pb.Model{ + Meta: &pb.MetaData{Name: "foo"}, + }, + }, + expected: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + State: &db.ModelStatus{}, + }, + { + Version: 2, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + State: &db.ModelStatus{}, + }, + { + Version: 3, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + State: &db.ModelStatus{ModelGwState: db.ModelState_ModelCreate}, + }, + }, + latest: 3, + }, + { + name: "AddThirdVersionInMiddle", + models: []*db.Model{ + { + Name: "foo", + Versions: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + Replicas: map[int32]*db.ReplicaStatus{}, + State: &db.ModelStatus{}, + }, + { + Version: 3, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + Replicas: map[int32]*db.ReplicaStatus{}, + State: &db.ModelStatus{}, + }, + }, + }, + }, + modelVersion: &agent.ModelVersion{ + Version: 2, + Model: &pb.Model{ + Meta: &pb.MetaData{Name: "foo"}, + }, + }, + expected: []*db.ModelVersion{ + { + Version: 1, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + State: &db.ModelStatus{}, + }, + { + Version: 2, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + State: &db.ModelStatus{ModelGwState: db.ModelState_ModelCreate}, + }, + { + Version: 3, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "foo"}}, + State: &db.ModelStatus{}, + }, + }, + latest: 3, + }, } for _, test := range tests { @@ -2055,575 +2128,576 @@ func TestAddModelVersionIfNotExists(t *testing.T) { err := modelStorage.Insert(context.TODO(), model) g.Expect(err).To(BeNil()) } - getModel := func(name string) *db.Model { - model, err := modelStorage.Get(context.TODO(), name) + + ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) + model, _, err := ms.addModelVersionIfNotExists(test.modelVersion) + g.Expect(err).To(BeNil()) + g.Expect(len(model.Versions)).To(Equal(len(test.expected))) + for i, modelVersion := range model.Versions { + g.Expect(proto.Equal(modelVersion, test.expected[i])).To(BeTrue()) + } + g.Expect(model.Latest().Version).To(Equal(test.latest)) + }) + } +} + +func TestAddServerReplica(t *testing.T) { + g := NewGomegaWithT(t) + ctx := context.Background() + + type test struct { + name string + models []*db.Model + servers []*db.Server + req *agent.AgentSubscribeRequest + expectedSnapshot []*db.Server + expectedModelEvents int64 + expectedServerEvents int64 + } + + tests := []test{ + { + name: "AddServerReplica - existing server", + models: []*db.Model{}, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + 1: {}, + }, + ExpectedReplicas: 3, + Shared: true, + }, + }, + req: &agent.AgentSubscribeRequest{ + ServerName: "server1", + ReplicaIdx: 2, + Shared: true, + }, + expectedSnapshot: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + 1: {}, + 2: {}, + }, + ExpectedReplicas: 3, + Shared: true, + }, + }, + expectedModelEvents: 0, + expectedServerEvents: 1, + }, + { + name: "AddServerReplica - new server", + models: []*db.Model{}, + servers: []*db.Server{}, + req: &agent.AgentSubscribeRequest{ + ServerName: "server1", + ReplicaIdx: 0, + Shared: true, + }, + expectedSnapshot: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + }, + ExpectedReplicas: -1, // expected replicas is not set + Shared: true, + }, + }, + expectedModelEvents: 0, + expectedServerEvents: 1, + }, + { + name: "AddServerReplica - with loaded models", + models: []*db.Model{}, + servers: []*db.Server{}, + req: &agent.AgentSubscribeRequest{ + ServerName: "server1", + ReplicaIdx: 0, + Shared: true, + LoadedModels: []*agent.ModelVersion{ + { + Model: &pb.Model{ + Meta: &pb.MetaData{Name: "model1"}, + ModelSpec: &pb.ModelSpec{}, + }, + Version: 1, + }, + { + Model: &pb.Model{ + Meta: &pb.MetaData{Name: "model2"}, + ModelSpec: &pb.ModelSpec{}, + }, + Version: 1, + }, + }, + }, + expectedSnapshot: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + }, + ExpectedReplicas: -1, // expected replicas is not set + Shared: true, + }, + }, + expectedModelEvents: 2, + expectedServerEvents: 1, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + logger := log.New() + eventHub, err := coordinator.NewEventHub(logger) + g.Expect(err).To(BeNil()) + + // Create storage instances + modelStorage := NewInMemoryStorage[*db.Model]() + serverStorage := NewInMemoryStorage[*db.Server]() + + // Populate storage with test data + for _, model := range test.models { + err := modelStorage.Insert(ctx, model) + g.Expect(err).To(BeNil()) + } + for _, server := range test.servers { + err := serverStorage.Insert(ctx, server) + g.Expect(err).To(BeNil()) + } + + // Create MemoryStore with populated storage + ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) + + // register a callback to check if the event is triggered + serverEvents := int64(0) + eventHub.RegisterServerEventHandler( + "handler-server", + 10, + logger, + func(event coordinator.ServerEventMsg) { atomic.AddInt64(&serverEvents, 1) }, + ) + + modelEvents := int64(0) + eventHub.RegisterModelEventHandler( + "handler-model", + 10, + logger, + func(event coordinator.ModelEventMsg) { atomic.AddInt64(&modelEvents, 1) }, + ) + + err = ms.AddServerReplica(test.req) + g.Expect(err).To(BeNil()) + actualSnapshot, err := ms.GetServers() + g.Expect(err).To(BeNil()) + for idx, server := range actualSnapshot { + g.Expect(server.Name).To(Equal(test.expectedSnapshot[idx].Name)) + g.Expect(server.Shared).To(Equal(test.expectedSnapshot[idx].Shared)) + g.Expect(server.ExpectedReplicas).To(Equal(test.expectedSnapshot[idx].ExpectedReplicas)) + g.Expect(len(server.Replicas)).To(Equal(len(test.expectedSnapshot[idx].Replicas))) + } + + time.Sleep(10 * time.Millisecond) + g.Expect(atomic.LoadInt64(&serverEvents)).To(Equal(test.expectedServerEvents)) + g.Expect(atomic.LoadInt64(&modelEvents)).To(Equal(test.expectedModelEvents)) + }) + } +} + +func TestRemoveServerReplica(t *testing.T) { + g := NewGomegaWithT(t) + ctx := context.Background() + + type test struct { + name string + models []*db.Model + servers []*db.Server + serverName string + replicaIdx int + serverExists bool + modelsReturned int + } + + tests := []test{ + { + name: "ReplicaRemovedButNotDeleted", + models: []*db.Model{}, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, + }, + 1: {}, + }, + ExpectedReplicas: 2, + Shared: true, + }, + }, + serverName: "server1", + replicaIdx: 0, + serverExists: true, + modelsReturned: 0, // no models really defined in store + }, + { + name: "ReplicaRemovedAndDeleted", + models: []*db.Model{ + { + Name: "model1", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: make(map[int32]*db.ReplicaStatus), + }, + }, + }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, + }, + 1: {}, + }, + ExpectedReplicas: -1, + Shared: true, + }, + }, + serverName: "server1", + replicaIdx: 0, + serverExists: true, + modelsReturned: 1, + }, + { + name: "ReplicaRemovedAndServerDeleted", + models: []*db.Model{ + { + Name: "model1", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: make(map[int32]*db.ReplicaStatus), + }, + }, + }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, + }, + }, + ExpectedReplicas: 0, + Shared: true, + }, + }, + serverName: "server1", + replicaIdx: 0, + serverExists: false, + modelsReturned: 1, + }, + { + name: "ReplicaRemovedAndServerDeleted but no model version in store", + models: []*db.Model{}, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, + }, + }, + ExpectedReplicas: 0, + Shared: true, + }, + }, + serverName: "server1", + replicaIdx: 0, + serverExists: false, + modelsReturned: 0, + }, + { + name: "ReplicaRemovedAndDeleted - loading models", + models: []*db.Model{ + { + Name: "model1", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: make(map[int32]*db.ReplicaStatus), + }, + }, + }, + { + Name: "model2", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: make(map[int32]*db.ReplicaStatus), + }, + }, + }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{ + {Name: "model1", Version: 1}, + {Name: "model2", Version: 1}, + }, + }, + 1: {}, + }, + ExpectedReplicas: -1, + Shared: true, + }, + }, + serverName: "server1", + replicaIdx: 0, + serverExists: true, + modelsReturned: 2, + }, + { + name: "ReplicaRemovedAndDeleted - non latest models", + models: []*db.Model{ + { + Name: "model1", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Loaded}, + }, + State: &db.ModelStatus{}, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "model1"}}, + }, + { + Version: 2, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_LoadFailed}, + }, + State: &db.ModelStatus{}, + ModelDefn: &pb.Model{Meta: &pb.MetaData{Name: "model1"}}, + }, + }, + }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, + }, + }, + ExpectedReplicas: -1, + Shared: true, + }, + }, + serverName: "server1", + replicaIdx: 0, + serverExists: false, + modelsReturned: 0, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + logger := log.New() + eventHub, err := coordinator.NewEventHub(logger) + g.Expect(err).To(BeNil()) + + // Create storage instances + modelStorage := NewInMemoryStorage[*db.Model]() + serverStorage := NewInMemoryStorage[*db.Server]() + + // Populate storage with test data + for _, model := range test.models { + err := modelStorage.Insert(ctx, model) + g.Expect(err).To(BeNil()) + } + for _, server := range test.servers { + err := serverStorage.Insert(ctx, server) g.Expect(err).To(BeNil()) - return model } + // Create MemoryStore with populated storage ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) - ms.addModelVersionIfNotExists(test.modelVersion) - modelName := test.modelVersion.GetModel().GetMeta().GetName() - g.Expect(getModel(modelName).GetVersions()).To(Equal(test.expected)) - g.Expect(getModel(modelName).Latest().Version).To(Equal(test.latest)) + + models, err := ms.RemoveServerReplica(test.serverName, test.replicaIdx) + g.Expect(err).To(BeNil()) + g.Expect(test.modelsReturned).To(Equal(len(models))) + server, _, err := ms.GetServer(test.serverName, false) + if test.serverExists { + g.Expect(err).To(BeNil()) + g.Expect(server).ToNot(BeNil()) + } else { + g.Expect(err).ToNot(BeNil()) + g.Expect(server).To(BeNil()) + } }) } } -// -//func TestAddServerReplica(t *testing.T) { -// g := NewGomegaWithT(t) -// ctx := context.Background() -// -// type test struct { -// name string -// models []*db.Model -// servers []*db.Server -// req *agent.AgentSubscribeRequest -// expectedSnapshot []*ServerSnapshot -// expectedModelEvents int64 -// expectedServerEvents int64 -// } -// -// tests := []test{ -// { -// name: "AddServerReplica - existing server", -// models: []*db.Model{}, -// servers: []*db.Server{ -// { -// Name: "server1", -// Replicas: map[int32]*db.ServerReplica{ -// 0: {}, -// 1: {}, -// }, -// ExpectedReplicas: 3, -// Shared: true, -// }, -// }, -// req: &agent.AgentSubscribeRequest{ -// ServerName: "server1", -// ReplicaIdx: 2, -// Shared: true, -// }, -// expectedSnapshot: []*ServerSnapshot{ -// { -// Name: "server1", -// Replicas: map[int]*ServerReplica{ -// 0: {}, -// 1: {}, -// 2: {}, -// }, -// ExpectedReplicas: 3, -// Shared: true, -// }, -// }, -// expectedModelEvents: 0, -// expectedServerEvents: 1, -// }, -// { -// name: "AddServerReplica - new server", -// models: []*db.Model{}, -// servers: []*db.Server{}, -// req: &agent.AgentSubscribeRequest{ -// ServerName: "server1", -// ReplicaIdx: 0, -// Shared: true, -// }, -// expectedSnapshot: []*ServerSnapshot{ -// { -// Name: "server1", -// Replicas: map[int]*ServerReplica{ -// 0: {}, -// }, -// ExpectedReplicas: -1, // expected replicas is not set -// Shared: true, -// }, -// }, -// expectedModelEvents: 0, -// expectedServerEvents: 1, -// }, -// { -// name: "AddServerReplica - with loaded models", -// models: []*db.Model{}, -// servers: []*db.Server{}, -// req: &agent.AgentSubscribeRequest{ -// ServerName: "server1", -// ReplicaIdx: 0, -// Shared: true, -// LoadedModels: []*agent.ModelVersion{ -// { -// Model: &pb.Model{ -// Meta: &pb.MetaData{Name: "model1"}, -// ModelSpec: &pb.ModelSpec{}, -// }, -// Version: 1, -// }, -// { -// Model: &pb.Model{ -// Meta: &pb.MetaData{Name: "model2"}, -// ModelSpec: &pb.ModelSpec{}, -// }, -// Version: 1, -// }, -// }, -// }, -// expectedSnapshot: []*ServerSnapshot{ -// { -// Name: "server1", -// Replicas: map[int]*ServerReplica{ -// 0: {}, -// }, -// ExpectedReplicas: -1, // expected replicas is not set -// Shared: true, -// }, -// }, -// expectedModelEvents: 2, -// expectedServerEvents: 1, -// }, -// } -// -// for _, test := range tests { -// t.Run(test.name, func(t *testing.T) { -// logger := log.New() -// eventHub, err := coordinator.NewEventHub(logger) -// g.Expect(err).To(BeNil()) -// -// // Create storage instances -// modelStorage := NewInMemoryStorage[*db.Model]() -// serverStorage := NewInMemoryStorage[*db.Server]() -// -// // Populate storage with test data -// for _, model := range test.models { -// err := modelStorage.Insert(ctx, model) -// g.Expect(err).To(BeNil()) -// } -// for _, server := range test.servers { -// err := serverStorage.Insert(ctx, server) -// g.Expect(err).To(BeNil()) -// } -// -// // Create MemoryStore with populated storage -// ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) -// -// // register a callback to check if the event is triggered -// serverEvents := int64(0) -// eventHub.RegisterServerEventHandler( -// "handler-server", -// 10, -// logger, -// func(event coordinator.ServerEventMsg) { atomic.AddInt64(&serverEvents, 1) }, -// ) -// -// modelEvents := int64(0) -// eventHub.RegisterModelEventHandler( -// "handler-model", -// 10, -// logger, -// func(event coordinator.ModelEventMsg) { atomic.AddInt64(&modelEvents, 1) }, -// ) -// -// err = ms.AddServerReplica(test.req) -// g.Expect(err).To(BeNil()) -// actualSnapshot, err := ms.GetServers(true, false) -// g.Expect(err).To(BeNil()) -// for idx, server := range actualSnapshot { -// g.Expect(server.Name).To(Equal(test.expectedSnapshot[idx].Name)) -// g.Expect(server.Shared).To(Equal(test.expectedSnapshot[idx].Shared)) -// g.Expect(server.ExpectedReplicas).To(Equal(test.expectedSnapshot[idx].ExpectedReplicas)) -// g.Expect(len(server.Replicas)).To(Equal(len(test.expectedSnapshot[idx].Replicas))) -// } -// -// time.Sleep(10 * time.Millisecond) -// g.Expect(atomic.LoadInt64(&serverEvents)).To(Equal(test.expectedServerEvents)) -// g.Expect(atomic.LoadInt64(&modelEvents)).To(Equal(test.expectedModelEvents)) -// }) -// } -//} -// -//func TestRemoveServerReplica(t *testing.T) { -// g := NewGomegaWithT(t) -// ctx := context.Background() -// -// type test struct { -// name string -// models []*db.Model -// servers []*db.Server -// serverName string -// replicaIdx int -// serverExists bool -// modelsReturned int -// } -// -// tests := []test{ -// { -// name: "ReplicaRemovedButNotDeleted", -// models: []*db.Model{}, -// servers: []*db.Server{ -// { -// Name: "server1", -// Replicas: map[int32]*db.ServerReplica{ -// 0: { -// LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, -// }, -// 1: {}, -// }, -// ExpectedReplicas: 2, -// Shared: true, -// }, -// }, -// serverName: "server1", -// replicaIdx: 0, -// serverExists: true, -// modelsReturned: 0, // no models really defined in store -// }, -// { -// name: "ReplicaRemovedAndDeleted", -// models: []*db.Model{ -// { -// Name: "model1", -// Versions: []*db.ModelVersion{ -// { -// Version: 1, -// Replicas: make(map[int32]*db.ReplicaStatus), -// }, -// }, -// }, -// }, -// servers: []*db.Server{ -// { -// Name: "server1", -// Replicas: map[int32]*db.ServerReplica{ -// 0: { -// LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, -// }, -// 1: {}, -// }, -// ExpectedReplicas: -1, -// Shared: true, -// }, -// }, -// serverName: "server1", -// replicaIdx: 0, -// serverExists: true, -// modelsReturned: 1, -// }, -// { -// name: "ReplicaRemovedAndServerDeleted", -// models: []*db.Model{ -// { -// Name: "model1", -// Versions: []*db.ModelVersion{ -// { -// Version: 1, -// Replicas: make(map[int32]*db.ReplicaStatus), -// }, -// }, -// }, -// }, -// servers: []*db.Server{ -// { -// Name: "server1", -// Replicas: map[int32]*db.ServerReplica{ -// 0: { -// LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, -// }, -// }, -// ExpectedReplicas: 0, -// Shared: true, -// }, -// }, -// serverName: "server1", -// replicaIdx: 0, -// serverExists: false, -// modelsReturned: 1, -// }, -// { -// name: "ReplicaRemovedAndServerDeleted but no model version in store", -// models: []*db.Model{}, -// servers: []*db.Server{ -// { -// Name: "server1", -// Replicas: map[int32]*db.ServerReplica{ -// 0: { -// LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, -// }, -// }, -// ExpectedReplicas: 0, -// Shared: true, -// }, -// }, -// serverName: "server1", -// replicaIdx: 0, -// serverExists: false, -// modelsReturned: 0, -// }, -// { -// name: "ReplicaRemovedAndDeleted - loading models", -// models: []*db.Model{ -// { -// Name: "model1", -// Versions: []*db.ModelVersion{ -// { -// Version: 1, -// Replicas: make(map[int32]*db.ReplicaStatus), -// }, -// }, -// }, -// { -// Name: "model2", -// Versions: []*db.ModelVersion{ -// { -// Version: 1, -// Replicas: make(map[int32]*db.ReplicaStatus), -// }, -// }, -// }, -// }, -// servers: []*db.Server{ -// { -// Name: "server1", -// Replicas: map[int32]*db.ServerReplica{ -// 0: { -// LoadedModels: []*db.ModelVersionID{ -// {Name: "model1", Version: 1}, -// {Name: "model2", Version: 1}, -// }, -// }, -// 1: {}, -// }, -// ExpectedReplicas: -1, -// Shared: true, -// }, -// }, -// serverName: "server1", -// replicaIdx: 0, -// serverExists: true, -// modelsReturned: 2, -// }, -// { -// name: "ReplicaRemovedAndDeleted - non latest models", -// models: []*db.Model{ -// { -// Name: "model1", -// Versions: []*db.ModelVersion{ -// { -// Version: 1, -// Replicas: map[int32]*db.ReplicaStatus{ -// 0: {State: db.ModelReplicaState_Loaded}, -// }, -// }, -// { -// Version: 2, -// Replicas: map[int32]*db.ReplicaStatus{ -// 0: {State: db.ModelReplicaState_LoadFailed}, -// }, -// }, -// }, -// }, -// }, -// servers: []*db.Server{ -// { -// Name: "server1", -// Replicas: map[int32]*db.ServerReplica{ -// 0: { -// LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}, -// }, -// }, -// ExpectedReplicas: -1, -// Shared: true, -// }, -// }, -// serverName: "server1", -// replicaIdx: 0, -// serverExists: false, -// modelsReturned: 0, -// }, -// } -// -// for _, test := range tests { -// t.Run(test.name, func(t *testing.T) { -// logger := log.New() -// eventHub, err := coordinator.NewEventHub(logger) -// g.Expect(err).To(BeNil()) -// -// // Create storage instances -// modelStorage := NewInMemoryStorage[*db.Model]() -// serverStorage := NewInMemoryStorage[*db.Server]() -// -// // Populate storage with test data -// for _, model := range test.models { -// err := modelStorage.Insert(ctx, model) -// g.Expect(err).To(BeNil()) -// } -// for _, server := range test.servers { -// err := serverStorage.Insert(ctx, server) -// g.Expect(err).To(BeNil()) -// } -// -// // Create MemoryStore with populated storage -// ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) -// -// models, err := ms.RemoveServerReplica(test.serverName, test.replicaIdx) -// g.Expect(err).To(BeNil()) -// g.Expect(test.modelsReturned).To(Equal(len(models))) -// server, _, err := ms.GetServer(test.serverName, false, true) -// if test.serverExists { -// g.Expect(err).To(BeNil()) -// g.Expect(server).ToNot(BeNil()) -// } else { -// g.Expect(err).ToNot(BeNil()) -// g.Expect(server).To(BeNil()) -// } -// }) -// } -//} -// -//func TestDrainServerReplica(t *testing.T) { -// g := NewGomegaWithT(t) -// ctx := context.Background() -// -// type test struct { -// name string -// models []*db.Model -// servers []*db.Server -// serverName string -// replicaIdx int -// modelsReturned []string -// } -// -// // if we have models returned check status is Draining -// tests := []test{ -// { -// name: "ReplicaSetDrainingNoModels", -// servers: []*db.Server{ -// { -// Name: "server1", -// Replicas: map[int32]*db.ServerReplica{ -// 0: {}, -// 1: {}, -// }, -// ExpectedReplicas: 2, -// Shared: true, -// }, -// }, -// serverName: "server1", -// replicaIdx: 0, -// modelsReturned: []string{}, -// }, -// { -// name: "ReplicaSetDrainingWithLoadedModels", -// models: []*db.Model{ -// { -// Name: "model1", -// Versions: []*db.ModelVersion{ -// { -// Version: 1, -// Replicas: map[int32]*db.ReplicaStatus{0: {State: db.ModelReplicaState_Loaded}}, -// }, -// }, -// }, -// }, -// servers: []*db.Server{ -// { -// Name: "server1", -// Replicas: map[int32]*db.ServerReplica{ -// 0: {LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}}, -// 1: {}, -// }, -// ExpectedReplicas: -1, -// Shared: true, -// }, -// }, -// serverName: "server1", -// replicaIdx: 0, -// modelsReturned: []string{"model1"}, -// }, -// { -// name: "ReplicaSetDrainingWithLoadedAndLoadingModels", -// models: []*db.Model{ -// { -// Name: "model1", -// Versions: []*db.ModelVersion{ -// { -// Version: 1, -// Replicas: map[int32]*db.ReplicaStatus{ -// 0: {State: db.ModelReplicaState_Loaded}}, -// }, -// }, -// }, -// { -// Name: "model2", -// Versions: []*db.ModelVersion{ -// { -// Version: 1, -// Replicas: map[int32]*db.ReplicaStatus{ -// 0: {State: db.ModelReplicaState_Loading}}, -// }, -// }, -// }, -// }, -// servers: []*db.Server{ -// { -// Name: "server1", -// Replicas: map[int32]*db.ServerReplica{ -// 0: { -// LoadedModels: []*db.ModelVersionID{ -// {Name: "model1", Version: 1}, -// }, -// LoadingModels: []*db.ModelVersionID{ -// {Name: "model2", Version: 1}, -// }, -// }, -// 1: {}, -// }, -// ExpectedReplicas: -1, -// Shared: true, -// }, -// }, -// serverName: "server1", -// replicaIdx: 0, -// modelsReturned: []string{"model1", "model2"}, -// }, -// } -// -// for _, test := range tests { -// t.Run(test.name, func(t *testing.T) { -// logger := log.New() -// eventHub, err := coordinator.NewEventHub(logger) -// g.Expect(err).To(BeNil()) -// -// // Create storage instances -// modelStorage := NewInMemoryStorage[*db.Model]() -// serverStorage := NewInMemoryStorage[*db.Server]() -// -// // Populate storage with test data -// for _, model := range test.models { -// err := modelStorage.Insert(ctx, model) -// g.Expect(err).To(BeNil()) -// } -// for _, server := range test.servers { -// err := serverStorage.Insert(ctx, server) -// g.Expect(err).To(BeNil()) -// } -// -// // Create MemoryStore with populated storage -// ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) -// -// models, err := ms.DrainServerReplica(test.serverName, test.replicaIdx) -// g.Expect(err).To(BeNil()) -// g.Expect(test.modelsReturned).To(Equal(models)) -// server, _, err := ms.GetServer(test.serverName, false, true) -// g.Expect(err).To(BeNil()) -// g.Expect(server).ToNot(BeNil()) -// g.Expect(server.Replicas[int32(test.replicaIdx)].GetIsDraining()).To(BeTrue()) -// -// if test.modelsReturned != nil { -// for _, model := range test.modelsReturned { -// dbModel, _ := ms.GetModel(model) -// state := dbModel.Latest().GetModelReplicaState(test.replicaIdx) -// g.Expect(state).To(Equal(db.ModelReplicaState_Draining)) -// } -// } -// }) -// } -//} +func TestDrainServerReplica(t *testing.T) { + g := NewGomegaWithT(t) + ctx := context.Background() + + type test struct { + name string + models []*db.Model + servers []*db.Server + serverName string + replicaIdx int + modelsReturned []string + } + + // if we have models returned check status is Draining + tests := []test{ + { + name: "ReplicaSetDrainingNoModels", + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: {}, + 1: {}, + }, + ExpectedReplicas: 2, + Shared: true, + }, + }, + serverName: "server1", + replicaIdx: 0, + modelsReturned: []string{}, + }, + { + name: "ReplicaSetDrainingWithLoadedModels", + models: []*db.Model{ + { + Name: "model1", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{0: {State: db.ModelReplicaState_Loaded}}, + }, + }, + }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: {LoadedModels: []*db.ModelVersionID{{Name: "model1", Version: 1}}}, + 1: {}, + }, + ExpectedReplicas: -1, + Shared: true, + }, + }, + serverName: "server1", + replicaIdx: 0, + modelsReturned: []string{"model1"}, + }, + { + name: "ReplicaSetDrainingWithLoadedAndLoadingModels", + models: []*db.Model{ + { + Name: "model1", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Loaded}}, + }, + }, + }, + { + Name: "model2", + Versions: []*db.ModelVersion{ + { + Version: 1, + Replicas: map[int32]*db.ReplicaStatus{ + 0: {State: db.ModelReplicaState_Loading}}, + }, + }, + }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: { + LoadedModels: []*db.ModelVersionID{ + {Name: "model1", Version: 1}, + }, + LoadingModels: []*db.ModelVersionID{ + {Name: "model2", Version: 1}, + }, + }, + 1: {}, + }, + ExpectedReplicas: -1, + Shared: true, + }, + }, + serverName: "server1", + replicaIdx: 0, + modelsReturned: []string{"model1", "model2"}, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + logger := log.New() + eventHub, err := coordinator.NewEventHub(logger) + g.Expect(err).To(BeNil()) + + // Create storage instances + modelStorage := NewInMemoryStorage[*db.Model]() + serverStorage := NewInMemoryStorage[*db.Server]() + + // Populate storage with test data + for _, model := range test.models { + err := modelStorage.Insert(ctx, model) + g.Expect(err).To(BeNil()) + } + for _, server := range test.servers { + err := serverStorage.Insert(ctx, server) + g.Expect(err).To(BeNil()) + } + + // Create MemoryStore with populated storage + ms := NewModelServerStore(logger, modelStorage, serverStorage, eventHub) + + models, err := ms.DrainServerReplica(test.serverName, test.replicaIdx) + g.Expect(err).To(BeNil()) + g.Expect(test.modelsReturned).To(Equal(models)) + server, _, err := ms.GetServer(test.serverName, false) + g.Expect(err).To(BeNil()) + g.Expect(server).ToNot(BeNil()) + g.Expect(server.Replicas[int32(test.replicaIdx)].GetIsDraining()).To(BeTrue()) + + if test.modelsReturned != nil { + for _, model := range test.modelsReturned { + dbModel, _ := ms.GetModel(model) + state := dbModel.Latest().GetModelReplicaState(test.replicaIdx) + g.Expect(state).To(Equal(db.ModelReplicaState_Draining)) + } + } + }) + } +} From 2bd3f62316004d6f470fd427d2d6184473a830c6 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 12 Jan 2026 13:13:42 +0000 Subject: [PATCH 32/50] fix expierment tests --- scheduler/pkg/store/experiment/state_test.go | 72 +++- scheduler/pkg/store/experiment/store_test.go | 379 ++++++++----------- scheduler/pkg/store/experiment/utils_test.go | 278 +++++++------- scheduler/pkg/store/mock/store.go | 113 +++--- 4 files changed, 413 insertions(+), 429 deletions(-) diff --git a/scheduler/pkg/store/experiment/state_test.go b/scheduler/pkg/store/experiment/state_test.go index e8cda540f1..21e2a633a2 100644 --- a/scheduler/pkg/store/experiment/state_test.go +++ b/scheduler/pkg/store/experiment/state_test.go @@ -13,12 +13,14 @@ import ( "testing" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/mock" "github.com/sirupsen/logrus" + "go.uber.org/mock/gomock" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/pipeline" ) @@ -597,11 +599,12 @@ func TestSetCandidateAndMirrorModelReadiness(t *testing.T) { g := NewGomegaWithT(t) type test struct { - name string - experiment *Experiment - modelStates map[string]store.ModelState + name string + experiment *Experiment + expectedCandidatesReady bool expectedMirrorReady bool + setupMock func(m *mock.MockModelServerAPI) } tests := []test{ @@ -615,7 +618,13 @@ func TestSetCandidateAndMirrorModelReadiness(t *testing.T) { }, }, }, - modelStates: map[string]store.ModelState{"model1": store.ModelAvailable}, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + }, expectedCandidatesReady: true, expectedMirrorReady: true, }, @@ -629,7 +638,13 @@ func TestSetCandidateAndMirrorModelReadiness(t *testing.T) { }, }, }, - modelStates: map[string]store.ModelState{"model1": store.ModelFailed}, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelFailed}, + }, + }}, nil).MinTimes(1) + }, expectedCandidatesReady: false, expectedMirrorReady: true, }, @@ -646,7 +661,18 @@ func TestSetCandidateAndMirrorModelReadiness(t *testing.T) { }, }, }, - modelStates: map[string]store.ModelState{"model1": store.ModelAvailable}, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + m.EXPECT().GetModel("model2").Return(&db.Model{Name: "model2", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{}, + }, + }}, nil).MinTimes(1) + }, expectedCandidatesReady: false, expectedMirrorReady: true, }, @@ -663,7 +689,18 @@ func TestSetCandidateAndMirrorModelReadiness(t *testing.T) { }, }, }, - modelStates: map[string]store.ModelState{"model1": store.ModelAvailable, "model2": store.ModelAvailable}, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + m.EXPECT().GetModel("model2").Return(&db.Model{Name: "model2", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + }, expectedCandidatesReady: true, expectedMirrorReady: true, }, @@ -680,18 +717,33 @@ func TestSetCandidateAndMirrorModelReadiness(t *testing.T) { Name: "model2", }, }, - modelStates: map[string]store.ModelState{"model1": store.ModelAvailable, "model2": store.ModelAvailable}, expectedCandidatesReady: true, expectedMirrorReady: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + m.EXPECT().GetModel("model2").Return(&db.Model{Name: "model2", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + }, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + mockServerMock := mock.NewMockModelServerAPI(ctrl) + test.setupMock(mockServerMock) + logger := logrus.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - server := NewExperimentServer(logger, eventHub, fakeModelStore{status: test.modelStates}, nil) + server := NewExperimentServer(logger, eventHub, mockServerMock, nil) err = server.StartExperiment(test.experiment) g.Expect(err).To(BeNil()) diff --git a/scheduler/pkg/store/experiment/store_test.go b/scheduler/pkg/store/experiment/store_test.go index 022419c60e..cdf35907fc 100644 --- a/scheduler/pkg/store/experiment/store_test.go +++ b/scheduler/pkg/store/experiment/store_test.go @@ -17,11 +17,9 @@ import ( "github.com/google/go-cmp/cmp" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) @@ -193,7 +191,12 @@ func TestStartExperiment(t *testing.T) { logger := logrus.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - server := NewExperimentServer(logger, eventHub, fakeModelStore{}, fakePipelineStore{}) + + modelStorage := store.NewInMemoryStorage[*db.Model]() + serverStorage := store.NewInMemoryStorage[*db.Server]() + ms := store.NewModelServerStore(logger, modelStorage, serverStorage, eventHub) + + server := NewExperimentServer(logger, eventHub, ms, fakePipelineStore{}) // init db _ = server.InitialiseOrRestoreDB(path, 10) for _, ea := range test.experiments { @@ -448,230 +451,144 @@ func TestRestoreExperiments(t *testing.T) { } } -type fakeModelStore struct { - status map[string]store.ModelState -} - -var _ store.ModelServerAPI = (*fakeModelStore)(nil) - -func (f fakeModelStore) UpdateModel(config *scheduler.LoadModelRequest) error { - panic("implement me") -} - -func (f fakeModelStore) GetModel(key string) (*store.ModelSnapshot, error) { - return &store.ModelSnapshot{ - Name: key, - Versions: []*store.ModelVersion{ - store.NewModelVersion(nil, 1, "server", nil, false, f.status[key]), - }, - }, nil -} - -func (f fakeModelStore) GetModels() ([]*store.ModelSnapshot, error) { - panic("implement me") -} - -func (f fakeModelStore) LockModel(modelId string) { - panic("implement me") -} - -func (f fakeModelStore) UnlockModel(modelId string) { - panic("implement me") -} - -func (f fakeModelStore) RemoveModel(req *scheduler.UnloadModelRequest) error { - panic("implement me") -} - -func (f fakeModelStore) GetServers(shallow bool, modelDetails bool) ([]*store.ServerSnapshot, error) { - panic("implement me") -} - -func (f fakeModelStore) GetServer(serverKey string, shallow bool, modelDetails bool) (*store.ServerSnapshot, error) { - panic("implement me") -} - -func (f fakeModelStore) UpdateLoadedModels(modelKey string, version uint32, serverKey string, replicas []*store.ServerReplica) error { - panic("implement me") -} - -func (f fakeModelStore) UnloadVersionModels(modelKey string, version uint32) (bool, error) { - panic("implement me") -} - -func (f fakeModelStore) UnloadModelGwVersionModels(modelKey string, version uint32) (bool, error) { - panic("implement me") -} - -func (f fakeModelStore) UpdateModelState(modelKey string, version uint32, serverKey string, replicaIdx int, availableMemory *uint64, expectedState, desiredState store.ModelReplicaState, reason string, runtimeInfo *scheduler.ModelRuntimeInfo) error { - panic("implement me") -} - -func (f fakeModelStore) AddServerReplica(request *agent.AgentSubscribeRequest) error { - panic("implement me") -} - -func (f fakeModelStore) ServerNotify(request *scheduler.ServerNotify) error { - panic("implement me") -} - -func (f fakeModelStore) RemoveServerReplica(serverName string, replicaIdx int) ([]string, error) { - panic("implement me") -} - -func (f fakeModelStore) DrainServerReplica(serverName string, replicaIdx int) ([]string, error) { - panic("implement me") -} - -func (f fakeModelStore) FailedScheduling(modelID string, version uint32, reason string, reset bool) error { - panic("implement me") -} - -func (f fakeModelStore) GetAllModels() []string { - panic("implement me") -} - -func (f fakeModelStore) SetModelGwModelState(name string, versionNumber uint32, status store.ModelState, reason string, source string) error { - panic("implement me") -} - -func TestHandleModelEvents(t *testing.T) { - g := NewGomegaWithT(t) - - type test struct { - name string - experiment *Experiment - modelStates map[string]store.ModelState - modelEventMsgs []coordinator.ModelEventMsg - expectedCandidatesReady bool - expectedMirrorReady bool - } - - tests := []test{ - { - name: "candidate ready as model is ready", - experiment: &Experiment{ - Name: "a", - Candidates: []*Candidate{ - { - Name: "model1", - }, - }, - }, - modelStates: map[string]store.ModelState{"model1": store.ModelAvailable}, - modelEventMsgs: []coordinator.ModelEventMsg{ - { - ModelName: "model1", - }, - }, - expectedCandidatesReady: true, - expectedMirrorReady: true, - }, - { - name: "candidates not ready as model is not ready", - experiment: &Experiment{ - Name: "a", - Candidates: []*Candidate{ - { - Name: "model1", - }, - }, - }, - modelStates: map[string]store.ModelState{"model1": store.ModelFailed}, - modelEventMsgs: []coordinator.ModelEventMsg{ - { - ModelName: "model1", - }, - }, - expectedCandidatesReady: false, - expectedMirrorReady: true, - }, - { - name: "multiple candidates only one ready", - experiment: &Experiment{ - Name: "a", - Candidates: []*Candidate{ - { - Name: "model1", - }, - { - Name: "model2", - }, - }, - }, - modelStates: map[string]store.ModelState{"model1": store.ModelAvailable}, - modelEventMsgs: []coordinator.ModelEventMsg{ - { - ModelName: "model1", - }, - }, - expectedCandidatesReady: false, - expectedMirrorReady: true, - }, - { - name: "multiple candidates all ready", - experiment: &Experiment{ - Name: "a", - Candidates: []*Candidate{ - { - Name: "model1", - }, - { - Name: "model2", - }, - }, - }, - modelStates: map[string]store.ModelState{"model1": store.ModelAvailable, "model2": store.ModelAvailable}, - modelEventMsgs: []coordinator.ModelEventMsg{ - { - ModelName: "model1", - }, - { - ModelName: "model2", - }, - }, - expectedCandidatesReady: true, - expectedMirrorReady: true, - }, - { - name: "mirror and candidate ready as model is ready", - experiment: &Experiment{ - Name: "a", - Candidates: []*Candidate{ - { - Name: "model1", - }, - }, - Mirror: &Mirror{ - Name: "model2", - }, - }, - modelStates: map[string]store.ModelState{"model1": store.ModelAvailable, "model2": store.ModelAvailable}, - modelEventMsgs: []coordinator.ModelEventMsg{ - { - ModelName: "model1", - }, - }, - expectedCandidatesReady: true, - expectedMirrorReady: true, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - logger := logrus.New() - eventHub, err := coordinator.NewEventHub(logger) - g.Expect(err).To(BeNil()) - server := NewExperimentServer(logger, eventHub, fakeModelStore{status: test.modelStates}, fakePipelineStore{}) - err = server.StartExperiment(test.experiment) - g.Expect(err).To(BeNil()) - for _, event := range test.modelEventMsgs { - server.handleModelEvents(event) - } - exp, err := server.GetExperiment(test.experiment.Name) - g.Expect(err).To(BeNil()) - g.Expect(exp.AreCandidatesReady()).To(Equal(test.expectedCandidatesReady)) - g.Expect(exp.IsMirrorReady()).To(Equal(test.expectedMirrorReady)) - }) - } -} +// +//func TestHandleModelEvents(t *testing.T) { +// g := NewGomegaWithT(t) +// +// type test struct { +// name string +// experiment *Experiment +// modelStates map[string]store.ModelState +// modelEventMsgs []coordinator.ModelEventMsg +// expectedCandidatesReady bool +// expectedMirrorReady bool +// } +// +// tests := []test{ +// { +// name: "candidate ready as model is ready", +// experiment: &Experiment{ +// Name: "a", +// Candidates: []*Candidate{ +// { +// Name: "model1", +// }, +// }, +// }, +// modelStates: map[string]store.ModelState{"model1": store.ModelAvailable}, +// modelEventMsgs: []coordinator.ModelEventMsg{ +// { +// ModelName: "model1", +// }, +// }, +// expectedCandidatesReady: true, +// expectedMirrorReady: true, +// }, +// { +// name: "candidates not ready as model is not ready", +// experiment: &Experiment{ +// Name: "a", +// Candidates: []*Candidate{ +// { +// Name: "model1", +// }, +// }, +// }, +// modelStates: map[string]store.ModelState{"model1": store.ModelFailed}, +// modelEventMsgs: []coordinator.ModelEventMsg{ +// { +// ModelName: "model1", +// }, +// }, +// expectedCandidatesReady: false, +// expectedMirrorReady: true, +// }, +// { +// name: "multiple candidates only one ready", +// experiment: &Experiment{ +// Name: "a", +// Candidates: []*Candidate{ +// { +// Name: "model1", +// }, +// { +// Name: "model2", +// }, +// }, +// }, +// modelStates: map[string]store.ModelState{"model1": store.ModelAvailable}, +// modelEventMsgs: []coordinator.ModelEventMsg{ +// { +// ModelName: "model1", +// }, +// }, +// expectedCandidatesReady: false, +// expectedMirrorReady: true, +// }, +// { +// name: "multiple candidates all ready", +// experiment: &Experiment{ +// Name: "a", +// Candidates: []*Candidate{ +// { +// Name: "model1", +// }, +// { +// Name: "model2", +// }, +// }, +// }, +// modelStates: map[string]store.ModelState{"model1": store.ModelAvailable, "model2": store.ModelAvailable}, +// modelEventMsgs: []coordinator.ModelEventMsg{ +// { +// ModelName: "model1", +// }, +// { +// ModelName: "model2", +// }, +// }, +// expectedCandidatesReady: true, +// expectedMirrorReady: true, +// }, +// { +// name: "mirror and candidate ready as model is ready", +// experiment: &Experiment{ +// Name: "a", +// Candidates: []*Candidate{ +// { +// Name: "model1", +// }, +// }, +// Mirror: &Mirror{ +// Name: "model2", +// }, +// }, +// modelStates: map[string]store.ModelState{"model1": store.ModelAvailable, "model2": store.ModelAvailable}, +// modelEventMsgs: []coordinator.ModelEventMsg{ +// { +// ModelName: "model1", +// }, +// }, +// expectedCandidatesReady: true, +// expectedMirrorReady: true, +// }, +// } +// +// for _, test := range tests { +// t.Run(test.name, func(t *testing.T) { +// logger := logrus.New() +// eventHub, err := coordinator.NewEventHub(logger) +// g.Expect(err).To(BeNil()) +// server := NewExperimentServer(logger, eventHub, fakeModelStore{status: test.modelStates}, fakePipelineStore{}) +// err = server.StartExperiment(test.experiment) +// g.Expect(err).To(BeNil()) +// for _, event := range test.modelEventMsgs { +// server.handleModelEvents(event) +// } +// exp, err := server.GetExperiment(test.experiment.Name) +// g.Expect(err).To(BeNil()) +// g.Expect(exp.AreCandidatesReady()).To(Equal(test.expectedCandidatesReady)) +// g.Expect(exp.IsMirrorReady()).To(Equal(test.expectedMirrorReady)) +// }) +// } +//} diff --git a/scheduler/pkg/store/experiment/utils_test.go b/scheduler/pkg/store/experiment/utils_test.go index 4f59c45911..39e39407ac 100644 --- a/scheduler/pkg/store/experiment/utils_test.go +++ b/scheduler/pkg/store/experiment/utils_test.go @@ -13,7 +13,6 @@ import ( "testing" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" ) @@ -202,141 +201,142 @@ func TestCreateExperiment(t *testing.T) { } -func TestCreateExperimentFromSnapshot(t *testing.T) { - g := NewGomegaWithT(t) - type test struct { - name string - proto *scheduler.ExperimentSnapshot - expected *Experiment - } - - getStrPtr := func(val string) *string { return &val } - tests := []test{ - { - name: "experiment", - proto: &scheduler.ExperimentSnapshot{ - Experiment: &scheduler.Experiment{ - Name: "foo", - Default: getStrPtr("model1"), - Candidates: []*scheduler.ExperimentCandidate{ - { - Name: "model1", - Weight: 20, - }, - { - Name: "model3", - Weight: 20, - }, - }, - Mirror: &scheduler.ExperimentMirror{ - Name: "model4", - Percent: 80, - }, - Config: &scheduler.ExperimentConfig{ - StickySessions: true, - }, - KubernetesMeta: &scheduler.KubernetesMeta{ - Namespace: "default", - Generation: 1, - }, - }, - Deleted: false, - }, - expected: &Experiment{ - Name: "foo", - Active: false, - Deleted: false, - Default: getStrPtr("model1"), - ResourceType: ModelResourceType, - Candidates: []*Candidate{ - { - Name: "model1", - Weight: 20, - }, - { - Name: "model3", - Weight: 20, - }, - }, - Mirror: &Mirror{ - Name: "model4", - Percent: 80, - }, - Config: &Config{ - StickySessions: true, - }, - KubernetesMeta: &KubernetesMeta{ - Namespace: "default", - Generation: 1, - }, - }, - }, - { - name: "deleted experiment", - proto: &scheduler.ExperimentSnapshot{ - Experiment: &scheduler.Experiment{ - Name: "foo", - Default: getStrPtr("model1"), - Candidates: []*scheduler.ExperimentCandidate{ - { - Name: "model1", - Weight: 20, - }, - { - Name: "model3", - Weight: 20, - }, - }, - Mirror: &scheduler.ExperimentMirror{ - Name: "model4", - Percent: 80, - }, - Config: &scheduler.ExperimentConfig{ - StickySessions: true, - }, - KubernetesMeta: &scheduler.KubernetesMeta{ - Namespace: "default", - Generation: 1, - }, - }, - Deleted: true, - }, - expected: &Experiment{ - Name: "foo", - Active: false, - Deleted: true, - Default: getStrPtr("model1"), - ResourceType: ModelResourceType, - Candidates: []*Candidate{ - { - Name: "model1", - Weight: 20, - }, - { - Name: "model3", - Weight: 20, - }, - }, - Mirror: &Mirror{ - Name: "model4", - Percent: 80, - }, - Config: &Config{ - StickySessions: true, - }, - KubernetesMeta: &KubernetesMeta{ - Namespace: "default", - Generation: 1, - }, - }, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - experiment := CreateExperimentFromSnapshot(test.proto) - g.Expect(experiment).To(Equal(test.expected)) - }) - } - -} +// +//func TestCreateExperimentFromSnapshot(t *testing.T) { +// g := NewGomegaWithT(t) +// type test struct { +// name string +// proto *db.ExperimentSnapshot +// expected *Experiment +// } +// +// getStrPtr := func(val string) *string { return &val } +// tests := []test{ +// { +// name: "experiment", +// proto: &scheduler.ExperimentSnapshot{ +// Experiment: &scheduler.Experiment{ +// Name: "foo", +// Default: getStrPtr("model1"), +// Candidates: []*scheduler.ExperimentCandidate{ +// { +// Name: "model1", +// Weight: 20, +// }, +// { +// Name: "model3", +// Weight: 20, +// }, +// }, +// Mirror: &scheduler.ExperimentMirror{ +// Name: "model4", +// Percent: 80, +// }, +// Config: &scheduler.ExperimentConfig{ +// StickySessions: true, +// }, +// KubernetesMeta: &scheduler.KubernetesMeta{ +// Namespace: "default", +// Generation: 1, +// }, +// }, +// Deleted: false, +// }, +// expected: &Experiment{ +// Name: "foo", +// Active: false, +// Deleted: false, +// Default: getStrPtr("model1"), +// ResourceType: ModelResourceType, +// Candidates: []*Candidate{ +// { +// Name: "model1", +// Weight: 20, +// }, +// { +// Name: "model3", +// Weight: 20, +// }, +// }, +// Mirror: &Mirror{ +// Name: "model4", +// Percent: 80, +// }, +// Config: &Config{ +// StickySessions: true, +// }, +// KubernetesMeta: &KubernetesMeta{ +// Namespace: "default", +// Generation: 1, +// }, +// }, +// }, +// { +// name: "deleted experiment", +// proto: &scheduler.ExperimentSnapshot{ +// Experiment: &scheduler.Experiment{ +// Name: "foo", +// Default: getStrPtr("model1"), +// Candidates: []*scheduler.ExperimentCandidate{ +// { +// Name: "model1", +// Weight: 20, +// }, +// { +// Name: "model3", +// Weight: 20, +// }, +// }, +// Mirror: &scheduler.ExperimentMirror{ +// Name: "model4", +// Percent: 80, +// }, +// Config: &scheduler.ExperimentConfig{ +// StickySessions: true, +// }, +// KubernetesMeta: &scheduler.KubernetesMeta{ +// Namespace: "default", +// Generation: 1, +// }, +// }, +// Deleted: true, +// }, +// expected: &Experiment{ +// Name: "foo", +// Active: false, +// Deleted: true, +// Default: getStrPtr("model1"), +// ResourceType: ModelResourceType, +// Candidates: []*Candidate{ +// { +// Name: "model1", +// Weight: 20, +// }, +// { +// Name: "model3", +// Weight: 20, +// }, +// }, +// Mirror: &Mirror{ +// Name: "model4", +// Percent: 80, +// }, +// Config: &Config{ +// StickySessions: true, +// }, +// KubernetesMeta: &KubernetesMeta{ +// Namespace: "default", +// Generation: 1, +// }, +// }, +// }, +// } +// +// for _, test := range tests { +// t.Run(test.name, func(t *testing.T) { +// experiment := CreateExperimentFromSnapshot(test.proto) +// g.Expect(experiment).To(Equal(test.expected)) +// }) +// } +// +//} diff --git a/scheduler/pkg/store/mock/store.go b/scheduler/pkg/store/mock/store.go index 0e773d90aa..1b849c9181 100644 --- a/scheduler/pkg/store/mock/store.go +++ b/scheduler/pkg/store/mock/store.go @@ -1,12 +1,3 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - // Code generated by MockGen. DO NOT EDIT. // Source: ./api.go // @@ -95,17 +86,17 @@ func (mr *MockModelServerAPIMockRecorder) EmitEvents() *gomock.Call { } // FailedScheduling mocks base method. -func (m *MockModelServerAPI) FailedScheduling(modelID string, version uint32, reason string, reset bool) error { +func (m *MockModelServerAPI) FailedScheduling(modelName string, version uint32, reason string, reset bool) error { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "FailedScheduling", modelID, version, reason, reset) + ret := m.ctrl.Call(m, "FailedScheduling", modelName, version, reason, reset) ret0, _ := ret[0].(error) return ret0 } // FailedScheduling indicates an expected call of FailedScheduling. -func (mr *MockModelServerAPIMockRecorder) FailedScheduling(modelID, version, reason, reset any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) FailedScheduling(modelName, version, reason, reset any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FailedScheduling", reflect.TypeOf((*MockModelServerAPI)(nil).FailedScheduling), modelID, version, reason, reset) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "FailedScheduling", reflect.TypeOf((*MockModelServerAPI)(nil).FailedScheduling), modelName, version, reason, reset) } // GetAllModels mocks base method. @@ -154,9 +145,9 @@ func (mr *MockModelServerAPIMockRecorder) GetModels() *gomock.Call { } // GetServer mocks base method. -func (m *MockModelServerAPI) GetServer(serverKey string, shallow, modelDetails bool) (*db.Server, *store.ServerStats, error) { +func (m *MockModelServerAPI) GetServer(serverName string, modelDetails bool) (*db.Server, *store.ServerStats, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetServer", serverKey, shallow, modelDetails) + ret := m.ctrl.Call(m, "GetServer", serverName, modelDetails) ret0, _ := ret[0].(*db.Server) ret1, _ := ret[1].(*store.ServerStats) ret2, _ := ret[2].(error) @@ -164,36 +155,48 @@ func (m *MockModelServerAPI) GetServer(serverKey string, shallow, modelDetails b } // GetServer indicates an expected call of GetServer. -func (mr *MockModelServerAPIMockRecorder) GetServer(serverKey, shallow, modelDetails any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) GetServer(serverName, modelDetails any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetServer", reflect.TypeOf((*MockModelServerAPI)(nil).GetServer), serverKey, shallow, modelDetails) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetServer", reflect.TypeOf((*MockModelServerAPI)(nil).GetServer), serverName, modelDetails) } // GetServers mocks base method. -func (m *MockModelServerAPI) GetServers(shallow, modelDetails bool) ([]*db.Server, error) { +func (m *MockModelServerAPI) GetServers() ([]*db.Server, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetServers", shallow, modelDetails) + ret := m.ctrl.Call(m, "GetServers") ret0, _ := ret[0].([]*db.Server) ret1, _ := ret[1].(error) return ret0, ret1 } // GetServers indicates an expected call of GetServers. -func (mr *MockModelServerAPIMockRecorder) GetServers(shallow, modelDetails any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) GetServers() *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetServers", reflect.TypeOf((*MockModelServerAPI)(nil).GetServers), shallow, modelDetails) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetServers", reflect.TypeOf((*MockModelServerAPI)(nil).GetServers)) } // LockModel mocks base method. -func (m *MockModelServerAPI) LockModel(modelId string) { +func (m *MockModelServerAPI) LockModel(modelName string) { m.ctrl.T.Helper() - m.ctrl.Call(m, "LockModel", modelId) + m.ctrl.Call(m, "LockModel", modelName) } // LockModel indicates an expected call of LockModel. -func (mr *MockModelServerAPIMockRecorder) LockModel(modelId any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) LockModel(modelName any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "LockModel", reflect.TypeOf((*MockModelServerAPI)(nil).LockModel), modelName) +} + +// LockServer mocks base method. +func (m *MockModelServerAPI) LockServer(serverName string) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "LockServer", serverName) +} + +// LockServer indicates an expected call of LockServer. +func (mr *MockModelServerAPIMockRecorder) LockServer(serverName any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "LockModel", reflect.TypeOf((*MockModelServerAPI)(nil).LockModel), modelId) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "LockServer", reflect.TypeOf((*MockModelServerAPI)(nil).LockServer), serverName) } // RemoveModel mocks base method. @@ -240,73 +243,85 @@ func (mr *MockModelServerAPIMockRecorder) ServerNotify(request any) *gomock.Call } // SetModelGwModelState mocks base method. -func (m *MockModelServerAPI) SetModelGwModelState(name string, versionNumber uint32, status db.ModelState, reason, source string) error { +func (m *MockModelServerAPI) SetModelGwModelState(modelName string, versionNumber uint32, status db.ModelState, reason, source string) error { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "SetModelGwModelState", name, versionNumber, status, reason, source) + ret := m.ctrl.Call(m, "SetModelGwModelState", modelName, versionNumber, status, reason, source) ret0, _ := ret[0].(error) return ret0 } // SetModelGwModelState indicates an expected call of SetModelGwModelState. -func (mr *MockModelServerAPIMockRecorder) SetModelGwModelState(name, versionNumber, status, reason, source any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) SetModelGwModelState(modelName, versionNumber, status, reason, source any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetModelGwModelState", reflect.TypeOf((*MockModelServerAPI)(nil).SetModelGwModelState), name, versionNumber, status, reason, source) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetModelGwModelState", reflect.TypeOf((*MockModelServerAPI)(nil).SetModelGwModelState), modelName, versionNumber, status, reason, source) } // UnloadModelGwVersionModels mocks base method. -func (m *MockModelServerAPI) UnloadModelGwVersionModels(modelKey string, version uint32) (bool, error) { +func (m *MockModelServerAPI) UnloadModelGwVersionModels(modelName string, version uint32) (bool, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "UnloadModelGwVersionModels", modelKey, version) + ret := m.ctrl.Call(m, "UnloadModelGwVersionModels", modelName, version) ret0, _ := ret[0].(bool) ret1, _ := ret[1].(error) return ret0, ret1 } // UnloadModelGwVersionModels indicates an expected call of UnloadModelGwVersionModels. -func (mr *MockModelServerAPIMockRecorder) UnloadModelGwVersionModels(modelKey, version any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) UnloadModelGwVersionModels(modelName, version any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnloadModelGwVersionModels", reflect.TypeOf((*MockModelServerAPI)(nil).UnloadModelGwVersionModels), modelKey, version) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnloadModelGwVersionModels", reflect.TypeOf((*MockModelServerAPI)(nil).UnloadModelGwVersionModels), modelName, version) } // UnloadVersionModels mocks base method. -func (m *MockModelServerAPI) UnloadVersionModels(modelKey string, version uint32) (bool, error) { +func (m *MockModelServerAPI) UnloadVersionModels(modelName string, version uint32) (bool, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "UnloadVersionModels", modelKey, version) + ret := m.ctrl.Call(m, "UnloadVersionModels", modelName, version) ret0, _ := ret[0].(bool) ret1, _ := ret[1].(error) return ret0, ret1 } // UnloadVersionModels indicates an expected call of UnloadVersionModels. -func (mr *MockModelServerAPIMockRecorder) UnloadVersionModels(modelKey, version any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) UnloadVersionModels(modelName, version any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnloadVersionModels", reflect.TypeOf((*MockModelServerAPI)(nil).UnloadVersionModels), modelKey, version) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnloadVersionModels", reflect.TypeOf((*MockModelServerAPI)(nil).UnloadVersionModels), modelName, version) } // UnlockModel mocks base method. -func (m *MockModelServerAPI) UnlockModel(modelId string) { +func (m *MockModelServerAPI) UnlockModel(modelName string) { m.ctrl.T.Helper() - m.ctrl.Call(m, "UnlockModel", modelId) + m.ctrl.Call(m, "UnlockModel", modelName) } // UnlockModel indicates an expected call of UnlockModel. -func (mr *MockModelServerAPIMockRecorder) UnlockModel(modelId any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) UnlockModel(modelName any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnlockModel", reflect.TypeOf((*MockModelServerAPI)(nil).UnlockModel), modelName) +} + +// UnlockServer mocks base method. +func (m *MockModelServerAPI) UnlockServer(serverName string) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "UnlockServer", serverName) +} + +// UnlockServer indicates an expected call of UnlockServer. +func (mr *MockModelServerAPIMockRecorder) UnlockServer(serverName any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnlockModel", reflect.TypeOf((*MockModelServerAPI)(nil).UnlockModel), modelId) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnlockServer", reflect.TypeOf((*MockModelServerAPI)(nil).UnlockServer), serverName) } // UpdateLoadedModels mocks base method. -func (m *MockModelServerAPI) UpdateLoadedModels(modelKey string, version uint32, serverKey string, replicas []*db.ServerReplica) error { +func (m *MockModelServerAPI) UpdateLoadedModels(modelName string, version uint32, serverKey string, replicas []*db.ServerReplica) error { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "UpdateLoadedModels", modelKey, version, serverKey, replicas) + ret := m.ctrl.Call(m, "UpdateLoadedModels", modelName, version, serverKey, replicas) ret0, _ := ret[0].(error) return ret0 } // UpdateLoadedModels indicates an expected call of UpdateLoadedModels. -func (mr *MockModelServerAPIMockRecorder) UpdateLoadedModels(modelKey, version, serverKey, replicas any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) UpdateLoadedModels(modelName, version, serverKey, replicas any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateLoadedModels", reflect.TypeOf((*MockModelServerAPI)(nil).UpdateLoadedModels), modelKey, version, serverKey, replicas) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateLoadedModels", reflect.TypeOf((*MockModelServerAPI)(nil).UpdateLoadedModels), modelName, version, serverKey, replicas) } // UpdateModel mocks base method. @@ -324,15 +339,15 @@ func (mr *MockModelServerAPIMockRecorder) UpdateModel(config any) *gomock.Call { } // UpdateModelState mocks base method. -func (m *MockModelServerAPI) UpdateModelState(modelKey string, version uint32, serverKey string, replicaIdx int, availableMemory *uint64, expectedState, desiredState db.ModelReplicaState, reason string, runtimeInfo *scheduler.ModelRuntimeInfo) error { +func (m *MockModelServerAPI) UpdateModelState(modelName string, version uint32, serverKey string, replicaIdx int, availableMemory *uint64, expectedState, desiredState db.ModelReplicaState, reason string, runtimeInfo *scheduler.ModelRuntimeInfo) error { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "UpdateModelState", modelKey, version, serverKey, replicaIdx, availableMemory, expectedState, desiredState, reason, runtimeInfo) + ret := m.ctrl.Call(m, "UpdateModelState", modelName, version, serverKey, replicaIdx, availableMemory, expectedState, desiredState, reason, runtimeInfo) ret0, _ := ret[0].(error) return ret0 } // UpdateModelState indicates an expected call of UpdateModelState. -func (mr *MockModelServerAPIMockRecorder) UpdateModelState(modelKey, version, serverKey, replicaIdx, availableMemory, expectedState, desiredState, reason, runtimeInfo any) *gomock.Call { +func (mr *MockModelServerAPIMockRecorder) UpdateModelState(modelName, version, serverKey, replicaIdx, availableMemory, expectedState, desiredState, reason, runtimeInfo any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateModelState", reflect.TypeOf((*MockModelServerAPI)(nil).UpdateModelState), modelKey, version, serverKey, replicaIdx, availableMemory, expectedState, desiredState, reason, runtimeInfo) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateModelState", reflect.TypeOf((*MockModelServerAPI)(nil).UpdateModelState), modelName, version, serverKey, replicaIdx, availableMemory, expectedState, desiredState, reason, runtimeInfo) } From 61517ef492c1aed0907dd6b15bcfed772ebf35c7 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 12 Jan 2026 16:26:30 +0000 Subject: [PATCH 33/50] fix envoy processor tests --- .../processor/incremental_benchmark_test.go | 15 +- .../pkg/envoy/processor/incremental_test.go | 358 +++++++++--------- scheduler/pkg/envoy/processor/server_test.go | 10 +- scheduler/pkg/util/model_versions.go | 16 + scheduler/pkg/util/server_replica.go | 44 +++ 5 files changed, 262 insertions(+), 181 deletions(-) create mode 100644 scheduler/pkg/util/model_versions.go create mode 100644 scheduler/pkg/util/server_replica.go diff --git a/scheduler/pkg/envoy/processor/incremental_benchmark_test.go b/scheduler/pkg/envoy/processor/incremental_benchmark_test.go index 1480de30b8..ba4c7aad62 100644 --- a/scheduler/pkg/envoy/processor/incremental_benchmark_test.go +++ b/scheduler/pkg/envoy/processor/incremental_benchmark_test.go @@ -15,6 +15,7 @@ import ( "testing" "time" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" "github.com/stretchr/testify/require" @@ -72,11 +73,11 @@ func addModel( server, _, err := ip.modelStore.GetServer(serverName, false) require.NoError(b, err) - replicas := []*store.ServerReplica{} - replicaStatuses := make(map[int]store.ReplicaStatus) + replicas := []*db.ServerReplica{} + replicaStatuses := make(map[int]db.ReplicaStatus) for i, r := range server.Replicas { replicas = append(replicas, r) - replicaStatuses[i] = store.ReplicaStatus{State: store.Available} + replicaStatuses[int(i)] = db.ReplicaStatus{State: db.ModelReplicaState_Available} } err = ip.modelStore.UpdateLoadedModels( @@ -95,8 +96,8 @@ func addModel( server.Name, replicaIdx, nil, - store.LoadRequested, - store.Loaded, + db.ModelReplicaState_LoadRequested, + db.ModelReplicaState_Loaded, "", nil, ) @@ -122,7 +123,9 @@ func benchmarkModelUpdate( eventHub, err := coordinator.NewEventHub(logger) require.NoError(b, err) - memoryStore := store.NewModelServerStore(logger, store.NewLocalSchedulerStore(), eventHub) + modelStorage := store.NewInMemoryStorage[*db.Model]() + serverStorage := store.NewInMemoryStorage[*db.Server]() + memoryStore := store.NewModelServerStore(logger, modelStorage, serverStorage, eventHub) pipelineStore := pipeline.NewPipelineStore(logger, eventHub, memoryStore) ip, err := NewIncrementalProcessor( "some node", diff --git a/scheduler/pkg/envoy/processor/incremental_test.go b/scheduler/pkg/envoy/processor/incremental_test.go index 091786572c..2decda813d 100644 --- a/scheduler/pkg/envoy/processor/incremental_test.go +++ b/scheduler/pkg/envoy/processor/incremental_test.go @@ -10,6 +10,7 @@ the Change License after the Change Date as each is defined in accordance with t package processor import ( + "context" "encoding/json" "flag" "fmt" @@ -23,6 +24,7 @@ import ( "github.com/envoyproxy/go-control-plane/pkg/cache/types" . "github.com/onsi/gomega" "github.com/onsi/gomega/format" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "google.golang.org/protobuf/encoding/protojson" @@ -47,8 +49,8 @@ func TestGetTrafficShare(t *testing.T) { g := NewGomegaWithT(t) type test struct { name string - latestModel *store.ModelVersion - lastAvailableModel *store.ModelVersion + latestModel *db.ModelVersion + lastAvailableModel *db.ModelVersion weight uint32 expectedLatestModelWeight uint32 expectedLastAvailableModelWeight uint32 @@ -57,73 +59,74 @@ func TestGetTrafficShare(t *testing.T) { tests := []test{ { name: "50 - 50", - latestModel: store.NewModelVersion(nil, 1, "server", map[int]store.ReplicaStatus{ + latestModel: util.NewTestModelVersion(nil, 1, "server", map[int32]*db.ReplicaStatus{ 1: { - State: store.Available, + State: db.ModelReplicaState_Available, }, - }, false, store.ModelAvailable), - lastAvailableModel: store.NewModelVersion(nil, 1, "server", map[int]store.ReplicaStatus{ + }, db.ModelState_ModelAvailable), + lastAvailableModel: util.NewTestModelVersion(nil, 1, "server", map[int32]*db.ReplicaStatus{ 1: { - State: store.Available, + State: db.ModelReplicaState_Available, }, - }, false, store.ModelAvailable), + }, db.ModelState_ModelAvailable, + ), weight: 100, expectedLatestModelWeight: 50, expectedLastAvailableModelWeight: 50, }, { name: "2 latest replicas to 1 last available", - latestModel: store.NewModelVersion(nil, 1, "server", map[int]store.ReplicaStatus{ + latestModel: util.NewTestModelVersion(nil, 1, "server", map[int32]*db.ReplicaStatus{ 1: { - State: store.Available, + State: db.ModelReplicaState_Available, }, 2: { - State: store.Available, + State: db.ModelReplicaState_Available, }, - }, false, store.ModelAvailable), - lastAvailableModel: store.NewModelVersion(nil, 1, "server", map[int]store.ReplicaStatus{ + }, db.ModelState_ModelAvailable), + lastAvailableModel: util.NewTestModelVersion(nil, 1, "server", map[int32]*db.ReplicaStatus{ 1: { - State: store.Available, + State: db.ModelReplicaState_Available, }, - }, false, store.ModelAvailable), + }, db.ModelState_ModelAvailable), weight: 100, expectedLatestModelWeight: 67, expectedLastAvailableModelWeight: 33, }, { name: "1 latest replicas to 2 last available", - latestModel: store.NewModelVersion(nil, 1, "server", map[int]store.ReplicaStatus{ + latestModel: util.NewTestModelVersion(nil, 1, "server", map[int32]*db.ReplicaStatus{ 1: { - State: store.Available, + State: db.ModelReplicaState_Available, }, - }, false, store.ModelAvailable), - lastAvailableModel: store.NewModelVersion(nil, 1, "server", map[int]store.ReplicaStatus{ + }, db.ModelState_ModelAvailable), + lastAvailableModel: util.NewTestModelVersion(nil, 1, "server", map[int32]*db.ReplicaStatus{ 1: { - State: store.Available, + State: db.ModelReplicaState_Available, }, 2: { - State: store.Available, + State: db.ModelReplicaState_Available, }, - }, false, store.ModelAvailable), + }, db.ModelState_ModelAvailable), weight: 100, expectedLatestModelWeight: 34, expectedLastAvailableModelWeight: 66, }, { name: "model failed so all to latest", - latestModel: store.NewModelVersion(nil, 1, "server", map[int]store.ReplicaStatus{ + latestModel: util.NewTestModelVersion(nil, 1, "server", map[int32]*db.ReplicaStatus{ 1: { - State: store.Available, + State: db.ModelReplicaState_Available, }, 2: { - State: store.Available, + State: db.ModelReplicaState_Available, }, - }, false, store.ModelAvailable), - lastAvailableModel: store.NewModelVersion(nil, 1, "server", map[int]store.ReplicaStatus{ + }, db.ModelState_ModelAvailable), + lastAvailableModel: util.NewTestModelVersion(nil, 1, "server", map[int32]*db.ReplicaStatus{ 1: { - State: store.LoadFailed, + State: db.ModelReplicaState_LoadFailed, }, - }, false, store.ModelAvailable), + }, db.ModelState_ModelAvailable), weight: 100, expectedLatestModelWeight: 100, expectedLastAvailableModelWeight: 0, @@ -142,8 +145,8 @@ func TestUpdateEnvoyForModelVersion(t *testing.T) { g := NewGomegaWithT(t) type test struct { name string - modelVersions []*store.ModelVersion - server *store.ServerSnapshot + modelVersions []*db.ModelVersion + server *db.Server traffic uint32 expectedRoutes int expectedClusters int @@ -152,25 +155,24 @@ func TestUpdateEnvoyForModelVersion(t *testing.T) { tests := []test{ { name: "Simple", - modelVersions: []*store.ModelVersion{ - store.NewModelVersion( + modelVersions: []*db.ModelVersion{ + util.NewTestModelVersion( &scheduler.Model{ Meta: &scheduler.MetaData{Name: "foo"}, DeploymentSpec: &scheduler.DeploymentSpec{LogPayloads: false}, }, 1, "server", - map[int]store.ReplicaStatus{ - 1: {State: store.Loaded}, + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Loaded}, }, - false, - store.ModelAvailable, + db.ModelState_ModelAvailable, ), }, - server: &store.ServerSnapshot{ + server: &db.Server{ Name: "server", - Replicas: map[int]*store.ServerReplica{ - 1: store.NewServerReplica("host1", 8080, 5000, 1, store.NewServer("server", false), nil, 100, 100, 0, nil, 100), + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server", false), nil, 100, 100, 0, nil, 100), }, }, traffic: 100, @@ -179,26 +181,25 @@ func TestUpdateEnvoyForModelVersion(t *testing.T) { }, { name: "With one replica unloading", - modelVersions: []*store.ModelVersion{ - store.NewModelVersion( + modelVersions: []*db.ModelVersion{ + util.NewTestModelVersion( &scheduler.Model{ Meta: &scheduler.MetaData{Name: "foo"}, DeploymentSpec: &scheduler.DeploymentSpec{LogPayloads: false}, }, 2, "server", - map[int]store.ReplicaStatus{ - 1: {State: store.Loaded}, - 2: {State: store.UnloadEnvoyRequested}, + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Loaded}, + 2: {State: db.ModelReplicaState_UnloadEnvoyRequested}, }, - false, - store.ModelAvailable, + db.ModelState_ModelAvailable, ), }, - server: &store.ServerSnapshot{ + server: &db.Server{ Name: "server", - Replicas: map[int]*store.ServerReplica{ - 1: store.NewServerReplica("host1", 8080, 5000, 1, store.NewServer("server", false), nil, 100, 100, 0, nil, 100), + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server", false), nil, 100, 100, 0, nil, 100), }, }, traffic: 100, @@ -207,38 +208,36 @@ func TestUpdateEnvoyForModelVersion(t *testing.T) { }, { name: "TwoRoutesSameCluster", - modelVersions: []*store.ModelVersion{ - store.NewModelVersion( + modelVersions: []*db.ModelVersion{ + util.NewTestModelVersion( &scheduler.Model{ Meta: &scheduler.MetaData{Name: "foo"}, DeploymentSpec: &scheduler.DeploymentSpec{LogPayloads: false}, }, 1, "server", - map[int]store.ReplicaStatus{ - 1: {State: store.Loaded}, + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Loaded}, }, - false, - store.ModelAvailable, + db.ModelState_ModelAvailable, ), - store.NewModelVersion( + util.NewTestModelVersion( &scheduler.Model{ Meta: &scheduler.MetaData{Name: "bar"}, DeploymentSpec: &scheduler.DeploymentSpec{LogPayloads: false}, }, 1, "server", - map[int]store.ReplicaStatus{ - 1: {State: store.Loaded}, + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Loaded}, }, - false, - store.ModelAvailable, + db.ModelState_ModelAvailable, ), }, - server: &store.ServerSnapshot{ + server: &db.Server{ Name: "server", - Replicas: map[int]*store.ServerReplica{ - 1: store.NewServerReplica("host1", 8080, 5000, 1, store.NewServer("server", false), nil, 100, 100, 0, nil, 100), + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server", false), nil, 100, 100, 0, nil, 100), }, }, traffic: 100, @@ -247,39 +246,37 @@ func TestUpdateEnvoyForModelVersion(t *testing.T) { }, { name: "TwoRoutesDifferentClusters", - modelVersions: []*store.ModelVersion{ - store.NewModelVersion( + modelVersions: []*db.ModelVersion{ + util.NewTestModelVersion( &scheduler.Model{ Meta: &scheduler.MetaData{Name: "foo"}, DeploymentSpec: &scheduler.DeploymentSpec{LogPayloads: false}, }, 1, "server", - map[int]store.ReplicaStatus{ - 1: {State: store.Loaded}, + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Loaded}, }, - false, - store.ModelAvailable, + db.ModelState_ModelAvailable, ), - store.NewModelVersion( + util.NewTestModelVersion( &scheduler.Model{ Meta: &scheduler.MetaData{Name: "bar"}, DeploymentSpec: &scheduler.DeploymentSpec{LogPayloads: false}, }, 1, "server", - map[int]store.ReplicaStatus{ - 2: {State: store.Loaded}, + map[int32]*db.ReplicaStatus{ + 2: {State: db.ModelReplicaState_Loaded}, }, - false, - store.ModelAvailable, + db.ModelState_ModelAvailable, ), }, - server: &store.ServerSnapshot{ + server: &db.Server{ Name: "server", - Replicas: map[int]*store.ServerReplica{ - 1: store.NewServerReplica("host1", 8080, 5000, 1, store.NewServer("server", false), nil, 100, 100, 0, nil, 100), - 2: store.NewServerReplica("host2", 8080, 5000, 1, store.NewServer("server", false), nil, 100, 100, 0, nil, 100), + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server", false), nil, 100, 100, 0, nil, 100), + 2: util.NewTestServerReplica("host2", 8080, 5000, 1, store.NewServer("server", false), nil, 100, 100, 0, nil, 100), }, }, traffic: 100, @@ -297,7 +294,7 @@ func TestUpdateEnvoyForModelVersion(t *testing.T) { xdsCache: xdsCache, } for _, mv := range test.modelVersions { - inc.updateEnvoyForModelVersion(mv.GetMeta().GetName(), mv, test.server, test.traffic, false) + inc.updateEnvoyForModelVersion(mv.ModelDefn.Meta.Name, mv, test.server, test.traffic, false) } g.Expect(inc.xdsCache.Routes.Length()).To(Equal(test.expectedRoutes)) @@ -321,8 +318,8 @@ func TestRollingUpdate(t *testing.T) { name: "Rolling update in progress", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model", "server", 2, []int{0, 1}, 2, []store.ModelReplicaState{store.Available, store.Loading}), + createTestModel("model", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model", "server", 2, []int{0, 1}, 2, []db.ModelReplicaState{db.ModelReplicaState_Available, db.ModelReplicaState_Loading}), }, numExpectedClusters: 4, numExpectedRoutes: 1, @@ -332,8 +329,8 @@ func TestRollingUpdate(t *testing.T) { name: "Rolling update complete", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 1), - createTestModel("model", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model", "server", 1, []int{0}, 2, []store.ModelReplicaState{store.Available}), + createTestModel("model", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model", "server", 1, []int{0}, 2, []db.ModelReplicaState{db.ModelReplicaState_Available}), }, numExpectedClusters: 2, numExpectedRoutes: 1, @@ -342,7 +339,10 @@ func TestRollingUpdate(t *testing.T) { } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - modelStore := store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil) + modelStorage := store.NewInMemoryStorage[*db.Model]() + serverStorage := store.NewInMemoryStorage[*db.Server]() + modelStore := store.NewModelServerStore(log.New(), modelStorage, serverStorage, nil) + xdsCache, err := xdscache.NewSeldonXDSCache(log.New(), &xdscache.PipelineGatewayDetails{Host: "pipeline", GrpcPort: 1, HttpPort: 2}, nil) g.Expect(err).To(BeNil()) inc := &IncrementalProcessor{ @@ -380,7 +380,7 @@ func TestDraining(t *testing.T) { numExpectedClusters int numExpectedRoutes int numTrafficSplits map[string]int - expectedModelState map[string]store.ModelState + expectedModelState map[string]db.ModelState } tests := []test{ @@ -388,39 +388,46 @@ func TestDraining(t *testing.T) { name: "Model with draining and available replicas", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model", "server", 1, []int{0, 1}, 1, []store.ModelReplicaState{store.Available, store.Draining}), + createTestModel("model", "server", 1, []int{0, 1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available, db.ModelReplicaState_Draining}), }, numExpectedClusters: 2, numExpectedRoutes: 1, numTrafficSplits: map[string]int{"model": 1}, - expectedModelState: map[string]store.ModelState{"model": store.ModelAvailable}, + expectedModelState: map[string]db.ModelState{"model": db.ModelState_ModelAvailable}, }, { name: "Model with draining and loading replicas", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model", "server", 1, []int{0, 1}, 1, []store.ModelReplicaState{store.Loading, store.Draining}), + createTestModel("model", "server", 1, []int{0, 1}, 1, []db.ModelReplicaState{ + db.ModelReplicaState_Loading, + db.ModelReplicaState_Draining}), }, numExpectedClusters: 2, numExpectedRoutes: 1, numTrafficSplits: map[string]int{"model": 1}, - expectedModelState: map[string]store.ModelState{"model": store.ModelProgressing}, + expectedModelState: map[string]db.ModelState{"model": db.ModelState_ModelProgressing}, }, { name: "Model load failed during draining so failed", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model", "server", 1, []int{0, 1}, 1, []store.ModelReplicaState{store.LoadFailed, store.Draining}), + createTestModel("model", "server", 1, []int{0, 1}, 1, []db.ModelReplicaState{ + db.ModelReplicaState_LoadFailed, + db.ModelReplicaState_Draining}), }, numExpectedClusters: 2, numExpectedRoutes: 1, numTrafficSplits: map[string]int{"model": 1}, - expectedModelState: map[string]store.ModelState{"model": store.ModelFailed}, + expectedModelState: map[string]db.ModelState{"model": db.ModelState_ModelFailed}, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - modelStore := store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil) + modelStorage := store.NewInMemoryStorage[*db.Model]() + serverStorage := store.NewInMemoryStorage[*db.Server]() + modelStore := store.NewModelServerStore(log.New(), modelStorage, serverStorage, nil) + xdsCache, err := xdscache.NewSeldonXDSCache(log.New(), &xdscache.PipelineGatewayDetails{Host: "pipeline", GrpcPort: 1, HttpPort: 2}, nil) g.Expect(err).To(BeNil()) inc := &IncrementalProcessor{ @@ -441,7 +448,7 @@ func TestDraining(t *testing.T) { for modelName, modelState := range test.expectedModelState { model, err := inc.modelStore.GetModel(modelName) g.Expect(err).To(BeNil()) - g.Expect(model.GetLatest().ModelState().State).To(Equal(modelState)) + g.Expect(model.Latest().State.State).To(Equal(modelState)) } }) } @@ -453,8 +460,8 @@ func TestModelSync(t *testing.T) { name string ops []func(proc *IncrementalProcessor, g *WithT) pendingModelVersions []*pendingModelVersion - expectedReplicaStats map[string]map[int]store.ModelReplicaState - expectedModelState map[string]store.ModelState + expectedReplicaStats map[string]map[int]db.ModelReplicaState + expectedModelState map[string]db.ModelState } tests := []test{ @@ -462,109 +469,112 @@ func TestModelSync(t *testing.T) { name: "test loaded", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Loaded}), + createTestModel("model", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Loaded}), }, pendingModelVersions: []*pendingModelVersion{ {name: "model", version: 1}, }, - expectedReplicaStats: map[string]map[int]store.ModelReplicaState{"model": {0: store.Available}}, - expectedModelState: map[string]store.ModelState{"model": store.ModelAvailable}, + expectedReplicaStats: map[string]map[int]db.ModelReplicaState{"model": {0: db.ModelReplicaState_Available}}, + expectedModelState: map[string]db.ModelState{"model": db.ModelState_ModelAvailable}, }, { name: "test draining", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Draining}), + createTestModel("model", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Draining}), }, - expectedReplicaStats: map[string]map[int]store.ModelReplicaState{"model": {0: store.Draining}}, - expectedModelState: map[string]store.ModelState{"model": store.ModelProgressing}, + expectedReplicaStats: map[string]map[int]db.ModelReplicaState{"model": {0: db.ModelReplicaState_Draining}}, + expectedModelState: map[string]db.ModelState{"model": db.ModelState_ModelProgressing}, }, { name: "test draining multiple replicas with other loaded", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model", "server", 1, []int{0, 1}, 1, []store.ModelReplicaState{store.Draining, store.Loaded}), + createTestModel("model", "server", 1, []int{0, 1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Draining, db.ModelReplicaState_Loaded}), }, - expectedReplicaStats: map[string]map[int]store.ModelReplicaState{"model": {0: store.Draining, 1: store.Available}}, - expectedModelState: map[string]store.ModelState{"model": store.ModelAvailable}, + expectedReplicaStats: map[string]map[int]db.ModelReplicaState{"model": {0: db.ModelReplicaState_Draining, 1: db.ModelReplicaState_Available}}, + expectedModelState: map[string]db.ModelState{"model": db.ModelState_ModelAvailable}, }, { name: "test draining multiple replicas with other loading", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model", "server", 1, []int{0, 1}, 1, []store.ModelReplicaState{store.Draining, store.Loading}), + createTestModel("model", "server", 1, []int{0, 1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Draining, db.ModelReplicaState_Loading}), }, - expectedReplicaStats: map[string]map[int]store.ModelReplicaState{"model": {0: store.Draining, 1: store.Loading}}, - expectedModelState: map[string]store.ModelState{"model": store.ModelProgressing}, + expectedReplicaStats: map[string]map[int]db.ModelReplicaState{"model": {0: db.ModelReplicaState_Draining, 1: db.ModelReplicaState_Loading}}, + expectedModelState: map[string]db.ModelState{"model": db.ModelState_ModelProgressing}, }, { name: "loaded unavailable turns to available", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.LoadedUnavailable}), + createTestModel("model", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_LoadedUnavailable}), }, - expectedReplicaStats: map[string]map[int]store.ModelReplicaState{"model": {0: store.Available}}, - expectedModelState: map[string]store.ModelState{"model": store.ModelAvailable}, + expectedReplicaStats: map[string]map[int]db.ModelReplicaState{"model": {0: db.ModelReplicaState_Available}}, + expectedModelState: map[string]db.ModelState{"model": db.ModelState_ModelAvailable}, }, { name: "load failed", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.LoadFailed}), + createTestModel("model", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_LoadFailed}), }, - expectedReplicaStats: map[string]map[int]store.ModelReplicaState{"model": {0: store.LoadFailed}}, - expectedModelState: map[string]store.ModelState{"model": store.ModelFailed}, + expectedReplicaStats: map[string]map[int]db.ModelReplicaState{"model": {0: db.ModelReplicaState_LoadFailed}}, + expectedModelState: map[string]db.ModelState{"model": db.ModelState_ModelFailed}, }, { name: "loading - 1 of 2 replicas loaded", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model", "server", 2, []int{0, 1}, 1, []store.ModelReplicaState{store.Loaded, store.Loading}), + createTestModel("model", "server", 2, []int{0, 1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Loaded, db.ModelReplicaState_Loading}), }, - expectedReplicaStats: map[string]map[int]store.ModelReplicaState{"model": {0: store.Available, 1: store.Loading}}, - expectedModelState: map[string]store.ModelState{"model": store.ModelProgressing}, + expectedReplicaStats: map[string]map[int]db.ModelReplicaState{"model": {0: db.ModelReplicaState_Available, 1: db.ModelReplicaState_Loading}}, + expectedModelState: map[string]db.ModelState{"model": db.ModelState_ModelProgressing}, }, { name: "load failed on 1 replica", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model", "server", 2, []int{0, 1}, 1, []store.ModelReplicaState{store.LoadFailed, store.Available}), + createTestModel("model", "server", 2, []int{0, 1}, 1, []db.ModelReplicaState{db.ModelReplicaState_LoadFailed, db.ModelReplicaState_Available}), }, - expectedReplicaStats: map[string]map[int]store.ModelReplicaState{"model": {0: store.LoadFailed, 1: store.Available}}, - expectedModelState: map[string]store.ModelState{"model": store.ModelFailed}, + expectedReplicaStats: map[string]map[int]db.ModelReplicaState{"model": {0: db.ModelReplicaState_LoadFailed, 1: db.ModelReplicaState_Available}}, + expectedModelState: map[string]db.ModelState{"model": db.ModelState_ModelFailed}, }, { name: "unload failed on 1 replica", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model", "server", 1, []int{0, 1}, 1, []store.ModelReplicaState{store.UnloadFailed, store.Available}), + createTestModel("model", "server", 1, []int{0, 1}, 1, []db.ModelReplicaState{db.ModelReplicaState_UnloadFailed, db.ModelReplicaState_Available}), }, - expectedReplicaStats: map[string]map[int]store.ModelReplicaState{"model": {0: store.UnloadFailed, 1: store.Available}}, - expectedModelState: map[string]store.ModelState{"model": store.ModelAvailable}, + expectedReplicaStats: map[string]map[int]db.ModelReplicaState{"model": {0: db.ModelReplicaState_UnloadFailed, 1: db.ModelReplicaState_Available}}, + expectedModelState: map[string]db.ModelState{"model": db.ModelState_ModelAvailable}, }, { name: "UnloadEnvoyRequest - model being deleted", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model", "server", 0, []int{0}, 1, []store.ModelReplicaState{store.UnloadEnvoyRequested}), + createTestModel("model", "server", 0, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_UnloadRequested}), }, - expectedReplicaStats: map[string]map[int]store.ModelReplicaState{"model": {0: store.UnloadRequested}}, + expectedReplicaStats: map[string]map[int]db.ModelReplicaState{"model": {0: db.ModelReplicaState_UnloadRequested}}, // note: model state removed here as this case can only happen when model is deleted, which we cannot simulate in this test. - expectedModelState: map[string]store.ModelState{}, + expectedModelState: map[string]db.ModelState{}, }, { name: "UnloadEnvoyRequest - model available", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model", "server", 1, []int{0, 1}, 1, []store.ModelReplicaState{store.UnloadEnvoyRequested, store.Available}), + createTestModel("model", "server", 1, []int{0, 1}, 1, []db.ModelReplicaState{db.ModelReplicaState_UnloadRequested, db.ModelReplicaState_Available}), }, - expectedReplicaStats: map[string]map[int]store.ModelReplicaState{"model": {0: store.UnloadRequested, 1: store.Available}}, - expectedModelState: map[string]store.ModelState{"model": store.ModelAvailable}, + expectedReplicaStats: map[string]map[int]db.ModelReplicaState{"model": {0: db.ModelReplicaState_UnloadRequested, 1: db.ModelReplicaState_Available}}, + expectedModelState: map[string]db.ModelState{"model": db.ModelState_ModelAvailable}, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - modelStore := store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil) + + modelStorage := store.NewInMemoryStorage[*db.Model]() + serverStorage := store.NewInMemoryStorage[*db.Server]() + modelStore := store.NewModelServerStore(log.New(), modelStorage, serverStorage, nil) xdsCache, err := xdscache.NewSeldonXDSCache(log.New(), &xdscache.PipelineGatewayDetails{Host: "pipeline", GrpcPort: 1, HttpPort: 2}, nil) g.Expect(err).To(BeNil()) inc := &IncrementalProcessor{ @@ -580,16 +590,16 @@ func TestModelSync(t *testing.T) { } inc.modelSyncWithLock() for modelName, modelReplicas := range test.expectedReplicaStats { - model, err := inc.modelStore.GetModel(modelName) + model, err := modelStorage.Get(context.TODO(), modelName) g.Expect(err).To(BeNil()) for replicaIdx, replicaState := range modelReplicas { - g.Expect(model.GetLatest().ReplicaState()[replicaIdx].State).To(Equal(replicaState)) + g.Expect(model.Latest().ReplicaState()[replicaIdx].State).To(Equal(replicaState)) } } for modelName, modelState := range test.expectedModelState { - model, err := inc.modelStore.GetModel(modelName) + model, err := modelStorage.Get(context.TODO(), modelName) g.Expect(err).To(BeNil()) - g.Expect(model.GetLatest().ModelState().State).To(Equal(modelState)) + g.Expect(model.Latest().State.State).To(Equal(modelState)) } }) } @@ -618,8 +628,8 @@ func TestEnvoySettings(t *testing.T) { name: "experiment with deleted model", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model1", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model2", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Available}), + createTestModel("model1", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model2", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), createTestExperiment("exp", []string{"model1", "model2"}, getStrPtr("model1"), nil), removeTestModel("model2", 1, "server", 1), }, @@ -633,7 +643,7 @@ func TestEnvoySettings(t *testing.T) { name: "One model", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 1), - createTestModel("model", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), + createTestModel("model", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), }, numExpectedClusters: 2, numExpectedRoutes: 1, @@ -643,8 +653,8 @@ func TestEnvoySettings(t *testing.T) { name: "two models", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model1", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model2", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Available}), + createTestModel("model1", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model2", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), }, numExpectedClusters: 4, numExpectedRoutes: 2, @@ -654,9 +664,9 @@ func TestEnvoySettings(t *testing.T) { name: "three models - 1 unloading", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model1", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model2", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model3", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Unloading}), + createTestModel("model1", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model2", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model3", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Unloading}), }, numExpectedClusters: 4, numExpectedRoutes: 2, @@ -666,8 +676,8 @@ func TestEnvoySettings(t *testing.T) { name: "experiment", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model1", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model2", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Available}), + createTestModel("model1", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model2", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), createTestExperiment("exp", []string{"model1", "model2"}, getStrPtr("model1"), nil), }, numExpectedClusters: 4, @@ -680,8 +690,8 @@ func TestEnvoySettings(t *testing.T) { name: "experiment - no default", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model1", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model2", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Available}), + createTestModel("model1", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model2", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), createTestExperiment("exp", []string{"model1", "model2"}, nil, nil), }, numExpectedClusters: 4, @@ -698,11 +708,11 @@ func TestEnvoySettings(t *testing.T) { name: "experiment - new model version", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model1", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model2", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Available}), + createTestModel("model1", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model2", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), createTestExperiment("exp", []string{"model1", "model2"}, nil, nil), // update model2 to version 2, will trigger change in routes / experiment - createTestModel("model2", "server", 1, []int{1}, 2, []store.ModelReplicaState{store.Available}), + createTestModel("model2", "server", 1, []int{1}, 2, []db.ModelReplicaState{db.ModelReplicaState_Available}), }, numExpectedClusters: 4, numExpectedRoutes: 3, @@ -719,8 +729,8 @@ func TestEnvoySettings(t *testing.T) { name: "delete experiment", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model1", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model2", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Available}), + createTestModel("model1", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model2", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), createTestExperiment("exp", []string{"model1", "model2"}, getStrPtr("model1"), nil), deleteTestExperiment("exp"), }, @@ -734,8 +744,8 @@ func TestEnvoySettings(t *testing.T) { name: "mirror", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model1", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model2", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Available}), + createTestModel("model1", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model2", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), createTestExperiment("exp", []string{"model1"}, getStrPtr("model1"), getStrPtr("model2")), }, numExpectedClusters: 4, @@ -748,8 +758,8 @@ func TestEnvoySettings(t *testing.T) { name: "mirror, deleted model", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model1", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model2", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Available}), + createTestModel("model1", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model2", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), createTestExperiment("exp", []string{"model1"}, getStrPtr("model1"), getStrPtr("model2")), removeTestModel("model2", 1, "server", 1), }, @@ -763,9 +773,9 @@ func TestEnvoySettings(t *testing.T) { name: "experiment with candidate and mirror", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model1", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model2", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model3", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Available}), + createTestModel("model1", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model2", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model3", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), createTestExperiment("exp", []string{"model1", "model2"}, getStrPtr("model1"), getStrPtr("model3")), }, numExpectedClusters: 6, @@ -778,9 +788,9 @@ func TestEnvoySettings(t *testing.T) { name: "pipeline", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model1", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model2", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model3", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Available}), + createTestModel("model1", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model2", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model3", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), createTestPipeline("pipe", []string{"model1", "model2", "model3"}, 1), }, numExpectedClusters: 8, @@ -792,9 +802,9 @@ func TestEnvoySettings(t *testing.T) { name: "pipeline with removed model", ops: []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 2), - createTestModel("model1", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model2", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Available}), - createTestModel("model3", "server", 1, []int{1}, 1, []store.ModelReplicaState{store.Available}), + createTestModel("model1", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model2", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), + createTestModel("model3", "server", 1, []int{1}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), createTestPipeline("pipe", []string{"model1", "model2", "model3"}, 1), removeTestModel("model2", 1, "server", 1), }, @@ -808,7 +818,11 @@ func TestEnvoySettings(t *testing.T) { t.Run(test.name, func(t *testing.T) { logger := log.New() eventHub, _ := coordinator.NewEventHub(logger) - memoryStore := store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), eventHub) + + modelStorage := store.NewInMemoryStorage[*db.Model]() + serverStorage := store.NewInMemoryStorage[*db.Server]() + memoryStore := store.NewModelServerStore(log.New(), modelStorage, serverStorage, eventHub) + xdsCache, err := xdscache.NewSeldonXDSCache(log.New(), &xdscache.PipelineGatewayDetails{Host: "pipeline", GrpcPort: 1, HttpPort: 2}, nil) g.Expect(err).To(BeNil()) inc := &IncrementalProcessor{ @@ -1063,7 +1077,7 @@ func createTestModel(modelName string, desiredReplicas uint32, replicas []int, version uint32, - replicaStates []store.ModelReplicaState, + replicaStates []db.ModelReplicaState, ) func(inc *IncrementalProcessor, g *WithT) { f := func(inc *IncrementalProcessor, g *WithT) { model := &scheduler.Model{ @@ -1079,18 +1093,18 @@ func createTestModel(modelName string, } err := inc.modelStore.UpdateModel(&scheduler.LoadModelRequest{Model: model}) g.Expect(err).To(BeNil()) - var serverReplicas []*store.ServerReplica + var serverReplicas []*db.ServerReplica for _, replicaIdx := range replicas { - var serverReplica *store.ServerReplica + var serverReplica *db.ServerReplica server, _, err := inc.modelStore.GetServer(serverName, true) g.Expect(err).To(BeNil()) if server != nil { - if sr, ok := server.Replicas[replicaIdx]; ok { + if sr, ok := server.Replicas[int32(replicaIdx)]; ok { serverReplica = sr } } if serverReplica == nil { - serverReplica = store.NewServerReplica("", 1, 2, replicaIdx, nil, nil, 1000, 1000, 0, nil, 0) + serverReplica = util.NewTestServerReplica("", 1, 2, int32(replicaIdx), nil, nil, 1000, 1000, 0, nil, 0) } serverReplicas = append(serverReplicas, serverReplica) } @@ -1100,7 +1114,7 @@ func createTestModel(modelName string, g.Expect(err).To(BeNil()) for idx, replicaIdx := range replicas { - err = inc.modelStore.UpdateModelState(modelName, version, serverName, replicaIdx, nil, store.LoadRequested, replicaStates[idx], "", nil) + err = inc.modelStore.UpdateModelState(modelName, version, serverName, replicaIdx, nil, db.ModelReplicaState_LoadRequested, replicaStates[idx], "", nil) g.Expect(err).To(BeNil()) } @@ -1119,7 +1133,7 @@ func removeTestModel( f := func(inc *IncrementalProcessor, g *WithT) { err := inc.modelStore.RemoveModel(&scheduler.UnloadModelRequest{Model: &scheduler.ModelReference{Name: "model1", Version: &version}}) g.Expect(err).To(BeNil()) - err = inc.modelStore.UpdateModelState(modelName, version, serverName, serverIdx, nil, store.Available, store.Unloaded, "", nil) + err = inc.modelStore.UpdateModelState(modelName, version, serverName, serverIdx, nil, db.ModelReplicaState_Available, db.ModelReplicaState_Unloaded, "", nil) g.Expect(err).To(BeNil()) } return f diff --git a/scheduler/pkg/envoy/processor/server_test.go b/scheduler/pkg/envoy/processor/server_test.go index 83c6931ed7..01eb769bb5 100644 --- a/scheduler/pkg/envoy/processor/server_test.go +++ b/scheduler/pkg/envoy/processor/server_test.go @@ -21,6 +21,7 @@ import ( client "github.com/envoyproxy/go-control-plane/pkg/client/sotw/v3" resource "github.com/envoyproxy/go-control-plane/pkg/resource/v3" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" @@ -43,7 +44,10 @@ func TestFetch(t *testing.T) { logger := log.New() - memoryStore := store.NewModelServerStore(logger, store.NewLocalSchedulerStore(), nil) + modelStorage := store.NewInMemoryStorage[*db.Model]() + serverStorage := store.NewInMemoryStorage[*db.Server]() + memoryStore := store.NewModelServerStore(logger, modelStorage, serverStorage, nil) + pipelineHandler := pipeline.NewPipelineStore(logger, nil, memoryStore) xdsCache, err := xdscache.NewSeldonXDSCache(log.New(), &xdscache.PipelineGatewayDetails{}, nil) @@ -88,7 +92,7 @@ func testInitialFetch(g *WithT, inc *IncrementalProcessor, c client.ADSClient) f ops := []func(inc *IncrementalProcessor, g *WithT){ createTestServer("server", 1), - createTestModel("model", "server", 1, []int{0}, 1, []store.ModelReplicaState{store.Available}), + createTestModel("model", "server", 1, []int{0}, 1, []db.ModelReplicaState{db.ModelReplicaState_Available}), } go func() { for _, op := range ops { @@ -119,7 +123,7 @@ func testUpdateModelVersion(g *WithT, inc *IncrementalProcessor, c client.ADSCli return func(t *testing.T) { ops := []func(inc *IncrementalProcessor, g *WithT){ - createTestModel("model", "server", 1, []int{0}, 2, []store.ModelReplicaState{store.Available}), + createTestModel("model", "server", 1, []int{0}, 2, []db.ModelReplicaState{db.ModelReplicaState_Available}), } go func() { for _, op := range ops { diff --git a/scheduler/pkg/util/model_versions.go b/scheduler/pkg/util/model_versions.go new file mode 100644 index 0000000000..029c1f8ba6 --- /dev/null +++ b/scheduler/pkg/util/model_versions.go @@ -0,0 +1,16 @@ +package util + +import ( + pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" +) + +func NewTestModelVersion(model *pb.Model, version uint32, server string, replicas map[int32]*db.ReplicaStatus, state db.ModelState) *db.ModelVersion { + return &db.ModelVersion{ + Version: version, + ModelDefn: model, + Server: server, + Replicas: replicas, + State: &db.ModelStatus{State: state}, + } +} diff --git a/scheduler/pkg/util/server_replica.go b/scheduler/pkg/util/server_replica.go new file mode 100644 index 0000000000..db110a8a6a --- /dev/null +++ b/scheduler/pkg/util/server_replica.go @@ -0,0 +1,44 @@ +package util + +import ( + "strings" + + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" +) + +func NewTestServerReplica(inferenceSvc string, + inferenceHttpPort int32, + inferenceGrpcPort int32, + replicaIdx int32, + server *db.Server, + capabilities []string, + memory, + availableMemory, + reservedMemory uint64, + loadedModels []*db.ModelVersionID, + overCommitPercentage uint32, +) *db.ServerReplica { + cleanCapabilities := func(capabilities []string) []string { + var cleaned []string + for _, capability := range capabilities { + cleaned = append(cleaned, strings.TrimSpace(capability)) + } + return cleaned + } + + return &db.ServerReplica{ + InferenceSvc: inferenceSvc, + InferenceHttpPort: inferenceHttpPort, + InferenceGrpcPort: inferenceGrpcPort, + ServerName: server.Name, + ReplicaIdx: replicaIdx, + Capabilities: cleanCapabilities(capabilities), + Memory: memory, + AvailableMemory: availableMemory, + ReservedMemory: reservedMemory, + LoadedModels: loadedModels, + LoadingModels: make([]*db.ModelVersionID, 0), + OverCommitPercentage: overCommitPercentage, + IsDraining: false, + } +} From 7723b361b75a217bf1a1860c7609db216a7240a1 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 12 Jan 2026 16:33:19 +0000 Subject: [PATCH 34/50] fix xdscache tests --- scheduler/pkg/envoy/xdscache/seldoncache_test.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scheduler/pkg/envoy/xdscache/seldoncache_test.go b/scheduler/pkg/envoy/xdscache/seldoncache_test.go index 2635408cc3..fb54c85813 100644 --- a/scheduler/pkg/envoy/xdscache/seldoncache_test.go +++ b/scheduler/pkg/envoy/xdscache/seldoncache_test.go @@ -22,6 +22,7 @@ import ( matcherv3 "github.com/envoyproxy/go-control-plane/envoy/type/matcher/v3" . "github.com/onsi/gomega" "github.com/otiai10/copy" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" @@ -570,24 +571,23 @@ func TestAccessLogSettings(t *testing.T) { } func addVersionedRoute(c *SeldonXDSCache, modelRouteName string, modelName string, httpCluster string, grpcCluster string, traffic uint32, version uint32) { - modelVersion := store.NewModelVersion( + modelVersion := util.NewTestModelVersion( &scheduler.Model{ Meta: &scheduler.MetaData{Name: modelName}, DeploymentSpec: &scheduler.DeploymentSpec{LogPayloads: false}, }, version, "server", - map[int]store.ReplicaStatus{ - 1: {State: store.Loaded}, + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Loaded}, }, - false, - store.ModelAvailable, + db.ModelState_ModelAvailable, ) - server := &store.ServerSnapshot{ + server := &db.Server{ Name: "server", - Replicas: map[int]*store.ServerReplica{ - 1: store.NewServerReplica("0.0.0.0", 9000, 9001, 1, store.NewServer("server", false), nil, 100, 100, 0, nil, 100), + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("0.0.0.0", 9000, 9001, 1, store.NewServer("server", false), nil, 100, 100, 0, nil, 100), }, } c.AddClustersForRoute(modelRouteName, modelName, httpCluster, grpcCluster, modelVersion.GetVersion(), []int{1}, server) From 16ed20b3a0b69c1f131e84451db540725e6697fd Mon Sep 17 00:00:00 2001 From: Miguel Angel Date: Mon, 12 Jan 2026 17:36:49 +0100 Subject: [PATCH 35/50] add auto mockModelServerAPI instead of our custom mocks (#7086) --- scheduler/pkg/store/pipeline/db_test.go | 100 ++++++++++++++++++-- scheduler/pkg/store/pipeline/status_test.go | 93 ------------------ scheduler/pkg/store/pipeline/store_test.go | 62 ++++++++++-- 3 files changed, 146 insertions(+), 109 deletions(-) diff --git a/scheduler/pkg/store/pipeline/db_test.go b/scheduler/pkg/store/pipeline/db_test.go index 1470c1bc86..2f5b8bdf56 100644 --- a/scheduler/pkg/store/pipeline/db_test.go +++ b/scheduler/pkg/store/pipeline/db_test.go @@ -17,10 +17,11 @@ import ( "github.com/dgraph-io/badger/v3" "github.com/google/go-cmp/cmp" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/mock" log "github.com/sirupsen/logrus" + "go.uber.org/mock/gomock" "google.golang.org/protobuf/proto" - - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) func TestSaveWithTTL(t *testing.T) { @@ -88,6 +89,7 @@ func TestSaveAndRestore(t *testing.T) { type test struct { name string pipelines []*Pipeline + setupMock func(m *mock.MockModelServerAPI) } tests := []test{ @@ -120,10 +122,18 @@ func TestSaveAndRestore(t *testing.T) { Deleted: false, }, }, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("a").Return(&db.Model{Name: "a", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + }, }, { name: "no pipelines", pipelines: []*Pipeline{}, + setupMock: func(m *mock.MockModelServerAPI) {}, }, { name: "test multiple pipelines", @@ -179,10 +189,26 @@ func TestSaveAndRestore(t *testing.T) { Deleted: false, }, }, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("a").Return(&db.Model{Name: "a", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + m.EXPECT().GetModel("b").Return(&db.Model{Name: "b", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelTerminating}, + }, + }}, nil).MinTimes(1) + }, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + mockModelServerAPI := mock.NewMockModelServerAPI(ctrl) + test.setupMock(mockModelServerAPI) + path := fmt.Sprintf("%s/db", t.TempDir()) logger := log.New() db, err := newPipelineDbManager(getPipelineDbFolder(path), logger, 10) @@ -194,7 +220,7 @@ func TestSaveAndRestore(t *testing.T) { err = db.Stop() g.Expect(err).To(BeNil()) - ps := NewPipelineStore(log.New(), nil, fakeModelStore{status: map[string]store.ModelState{}}) + ps := NewPipelineStore(log.New(), nil, mockModelServerAPI) err = ps.InitialiseOrRestoreDB(path, 10) g.Expect(err).To(BeNil()) for _, p := range test.pipelines { @@ -207,9 +233,10 @@ func TestSaveAndRestore(t *testing.T) { func TestSaveAndRestoreDeletedPipelines(t *testing.T) { g := NewGomegaWithT(t) type test struct { - name string - pipeline Pipeline - withTTL bool + name string + pipeline Pipeline + withTTL bool + setupMock func(m *mock.MockModelServerAPI) } createdDeletedPipeline := func(name string) Pipeline { @@ -244,15 +271,33 @@ func TestSaveAndRestoreDeletedPipelines(t *testing.T) { name: "test deleted pipeline with TTL should have deletedAt set", pipeline: createdDeletedPipeline("with-ttl"), withTTL: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("a").Return(&db.Model{Name: "a", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + }, }, { name: "test deleted pipeline without TTL should have deletedAt set after cleanup", pipeline: createdDeletedPipeline("without-ttl"), withTTL: false, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("a").Return(&db.Model{Name: "a", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + }, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + mockModelServerAPI := mock.NewMockModelServerAPI(ctrl) + test.setupMock(mockModelServerAPI) + g.Expect(test.pipeline.Deleted).To(BeTrue(), "this is a test for deleted pipelines") path := fmt.Sprintf("%s/db", t.TempDir()) logger := log.New() @@ -268,7 +313,7 @@ func TestSaveAndRestoreDeletedPipelines(t *testing.T) { err = pdb.Stop() g.Expect(err).To(BeNil()) - ps := NewPipelineStore(log.New(), nil, fakeModelStore{status: map[string]store.ModelState{}}) + ps := NewPipelineStore(log.New(), nil, mockModelServerAPI) err = ps.InitialiseOrRestoreDB(path, 10) g.Expect(err).To(BeNil()) @@ -448,6 +493,7 @@ func TestMigrateFromV1ToV2(t *testing.T) { type test struct { name string pipelines []*Pipeline + setupMock func(m *mock.MockModelServerAPI) } tests := []test{ @@ -480,10 +526,18 @@ func TestMigrateFromV1ToV2(t *testing.T) { Deleted: false, }, }, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("a").Return(&db.Model{Name: "a", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + }, }, { name: "no pipelines", pipelines: []*Pipeline{}, + setupMock: func(m *mock.MockModelServerAPI) {}, }, { name: "test multiple pipelines", @@ -539,12 +593,28 @@ func TestMigrateFromV1ToV2(t *testing.T) { Deleted: false, }, }, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("a").Return(&db.Model{Name: "a", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + m.EXPECT().GetModel("b").Return(&db.Model{Name: "b", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + }, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + mockModelServerAPI := mock.NewMockModelServerAPI(ctrl) + test.setupMock(mockModelServerAPI) + path := fmt.Sprintf("%s/db", t.TempDir()) - ps := NewPipelineStore(log.New(), nil, fakeModelStore{status: map[string]store.ModelState{}}) + ps := NewPipelineStore(log.New(), nil, mockModelServerAPI) err := ps.InitialiseOrRestoreDB(path, 10) g.Expect(err).To(BeNil()) for _, p := range test.pipelines { @@ -576,6 +646,7 @@ func TestMigrateToCore210(t *testing.T) { name string pipelines []*Pipeline expected []*Pipeline + setupMock func(m *mock.MockModelServerAPI) } timestamp := time.Now().UTC() @@ -635,13 +706,24 @@ func TestMigrateToCore210(t *testing.T) { Deleted: false, }, }, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("a").Return(&db.Model{Name: "a", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + }, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + mockModelServerAPI := mock.NewMockModelServerAPI(ctrl) + test.setupMock(mockModelServerAPI) + path := fmt.Sprintf("%s/db", t.TempDir()) - ps := NewPipelineStore(log.New(), nil, fakeModelStore{status: map[string]store.ModelState{}}) + ps := NewPipelineStore(log.New(), nil, mockModelServerAPI) err := ps.InitialiseOrRestoreDB(path, 10) g.Expect(err).To(BeNil()) for _, p := range test.pipelines { diff --git a/scheduler/pkg/store/pipeline/status_test.go b/scheduler/pkg/store/pipeline/status_test.go index 801aad0336..9a466e6ff2 100644 --- a/scheduler/pkg/store/pipeline/status_test.go +++ b/scheduler/pkg/store/pipeline/status_test.go @@ -14,101 +14,8 @@ import ( . "github.com/onsi/gomega" "github.com/sirupsen/logrus" - - "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" - - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) -type fakeModelStore struct { - status map[string]store.ModelState -} - -var _ store.ModelServerAPI = (*fakeModelStore)(nil) - -func (f fakeModelStore) UpdateModel(config *scheduler.LoadModelRequest) error { - panic("implement me") -} - -func (f fakeModelStore) GetModel(key string) (*store.ModelSnapshot, error) { - return &store.ModelSnapshot{ - Name: key, - Versions: []*store.ModelVersion{ - store.NewModelVersion(nil, 1, "server", nil, false, f.status[key]), - }, - }, nil -} - -func (f fakeModelStore) GetModels() ([]*store.ModelSnapshot, error) { - panic("implement me") -} - -func (f fakeModelStore) LockModel(modelId string) { - panic("implement me") -} - -func (f fakeModelStore) UnlockModel(modelId string) { - panic("implement me") -} - -func (f fakeModelStore) RemoveModel(req *scheduler.UnloadModelRequest) error { - panic("implement me") -} - -func (f fakeModelStore) GetServers(shallow bool, modelDetails bool) ([]*store.ServerSnapshot, error) { - panic("implement me") -} - -func (f fakeModelStore) GetServer(serverKey string, shallow bool, modelDetails bool) (*store.ServerSnapshot, error) { - panic("implement me") -} - -func (f fakeModelStore) UpdateLoadedModels(modelKey string, version uint32, serverKey string, replicas []*store.ServerReplica) error { - panic("implement me") -} - -func (f fakeModelStore) UnloadVersionModels(modelKey string, version uint32) (bool, error) { - panic("implement me") -} - -func (f fakeModelStore) UnloadModelGwVersionModels(modelKey string, version uint32) (bool, error) { - panic("implement me") -} - -func (f fakeModelStore) UpdateModelState(modelKey string, version uint32, serverKey string, replicaIdx int, availableMemory *uint64, expectedState, desiredState store.ModelReplicaState, reason string, runtimeInfo *scheduler.ModelRuntimeInfo) error { - panic("implement me") -} - -func (f fakeModelStore) AddServerReplica(request *agent.AgentSubscribeRequest) error { - panic("implement me") -} - -func (f fakeModelStore) ServerNotify(request *scheduler.ServerNotify) error { - panic("implement me") -} - -func (f fakeModelStore) RemoveServerReplica(serverName string, replicaIdx int) ([]string, error) { - panic("implement me") -} - -func (f fakeModelStore) FailedScheduling(modelID string, version uint32, reason string, reset bool) error { - panic("implement me") -} - -func (f fakeModelStore) GetAllModels() []string { - panic("implement me") -} - -func (f fakeModelStore) DrainServerReplica(serverName string, replicaIdx int) ([]string, error) { - // TODO implement me - panic("implement me") -} - -func (f fakeModelStore) SetModelGwModelState(name string, versionNumber uint32, status store.ModelState, reason string, source string) error { - panic("implement me") -} - func TestUpdatePipelineModelAvailable(t *testing.T) { g := NewGomegaWithT(t) type test struct { diff --git a/scheduler/pkg/store/pipeline/store_test.go b/scheduler/pkg/store/pipeline/store_test.go index 95e7b1d728..ae7cd5cf77 100644 --- a/scheduler/pkg/store/pipeline/store_test.go +++ b/scheduler/pkg/store/pipeline/store_test.go @@ -15,12 +15,14 @@ import ( "time" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/mock" "github.com/sirupsen/logrus" + "go.uber.org/mock/gomock" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) func TestGetPipelinesPipelineGwStatus(t *testing.T) { @@ -343,10 +345,13 @@ func TestAddPipeline(t *testing.T) { name string proto *scheduler.Pipeline store *PipelineStore + setupMock func(m *mock.MockModelServerAPI) expectedVersion uint32 err error } + ctrl := gomock.NewController(t) + tests := []test{ { name: "add pipeline none existing", @@ -367,9 +372,16 @@ func TestAddPipeline(t *testing.T) { pipelines: map[string]*Pipeline{}, modelStatusHandler: ModelStatusHandler{ modelReferences: map[string]map[string]void{}, - store: fakeModelStore{status: map[string]store.ModelState{}}, + store: mock.NewMockModelServerAPI(ctrl), }, }, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("step1").Return(&db.Model{Name: "step1", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + }, expectedVersion: 1, }, { @@ -395,9 +407,16 @@ func TestAddPipeline(t *testing.T) { pipelines: map[string]*Pipeline{}, modelStatusHandler: ModelStatusHandler{ modelReferences: map[string]map[string]void{}, - store: fakeModelStore{status: map[string]store.ModelState{}}, + store: mock.NewMockModelServerAPI(ctrl), }, }, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("step1").Return(&db.Model{Name: "step1", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + }, expectedVersion: 1, }, { @@ -433,9 +452,16 @@ func TestAddPipeline(t *testing.T) { }, modelStatusHandler: ModelStatusHandler{ modelReferences: map[string]map[string]void{}, - store: fakeModelStore{status: map[string]store.ModelState{}}, + store: mock.NewMockModelServerAPI(ctrl), }, }, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("step1").Return(&db.Model{Name: "step1", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + }, expectedVersion: 2, }, { @@ -469,9 +495,16 @@ func TestAddPipeline(t *testing.T) { }, modelStatusHandler: ModelStatusHandler{ modelReferences: map[string]map[string]void{}, - store: fakeModelStore{status: map[string]store.ModelState{}}, + store: mock.NewMockModelServerAPI(ctrl), }, }, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("step1").Return(&db.Model{Name: "step1", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + }, expectedVersion: 1, }, { @@ -504,9 +537,16 @@ func TestAddPipeline(t *testing.T) { }, modelStatusHandler: ModelStatusHandler{ modelReferences: map[string]map[string]void{}, - store: fakeModelStore{status: map[string]store.ModelState{}}, + store: mock.NewMockModelServerAPI(ctrl), }, }, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("step1").Return(&db.Model{Name: "step1", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + }, expectedVersion: 1, }, { @@ -539,15 +579,23 @@ func TestAddPipeline(t *testing.T) { }, modelStatusHandler: ModelStatusHandler{ modelReferences: map[string]map[string]void{}, - store: fakeModelStore{status: map[string]store.ModelState{}}, + store: mock.NewMockModelServerAPI(ctrl), }, }, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().GetModel("step1").Return(&db.Model{Name: "step1", Versions: []*db.ModelVersion{ + { + State: &db.ModelStatus{State: db.ModelState_ModelAvailable}, + }, + }}, nil).MinTimes(1) + }, expectedVersion: 1, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { + test.setupMock(test.store.modelStatusHandler.store.(*mock.MockModelServerAPI)) logger := logrus.New() path := fmt.Sprintf("%s/db", t.TempDir()) db, _ := newPipelineDbManager(getPipelineDbFolder(path), logger, 10) From 317ea2ad523d7d660b49f80094fd88a7fa513658 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 12 Jan 2026 18:10:09 +0000 Subject: [PATCH 36/50] agent test fixes --- scheduler/pkg/agent/server_test.go | 803 ++++++++++------------ scheduler/pkg/scheduler/interface.go | 1 + scheduler/pkg/scheduler/mock/interface.go | 68 ++ 3 files changed, 449 insertions(+), 423 deletions(-) create mode 100644 scheduler/pkg/scheduler/mock/interface.go diff --git a/scheduler/pkg/agent/server_test.go b/scheduler/pkg/agent/server_test.go index 7336338f03..5d787e75f9 100644 --- a/scheduler/pkg/agent/server_test.go +++ b/scheduler/pkg/agent/server_test.go @@ -17,131 +17,24 @@ import ( "time" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/internal/testing_utils" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/scheduler/mock" log "github.com/sirupsen/logrus" + "go.uber.org/mock/gomock" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" + "google.golang.org/protobuf/types/known/timestamppb" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" pbs "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" - testing_utils "github.com/seldonio/seldon-core/scheduler/v2/pkg/internal/testing_utils" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/scheduler" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) -type mockScheduler struct{} - -var _ scheduler.Scheduler = (*mockScheduler)(nil) - -func (s mockScheduler) Schedule(_ string) error { - return nil -} - -func (s mockScheduler) ScheduleFailedModels() ([]string, error) { - return nil, nil -} - -type mockStore struct { - models map[string]*store.ModelSnapshot -} - -var _ store.ModelServerAPI = (*mockStore)(nil) - -func (m *mockStore) FailedScheduling(modelID string, version uint32, reason string, reset bool) error { - return nil -} - -func (m *mockStore) UpdateModel(config *pbs.LoadModelRequest) error { - panic("implement me") -} - -func (m *mockStore) GetModel(key string) (*store.ModelSnapshot, error) { - return m.models[key], nil -} - -func (f mockStore) GetModels() ([]*store.ModelSnapshot, error) { - models := []*store.ModelSnapshot{} - for _, m := range f.models { - models = append(models, m) - } - return models, nil -} - -func (m mockStore) LockModel(modelId string) { -} - -func (m mockStore) UnlockModel(modelId string) { -} - -func (m *mockStore) RemoveModel(req *pbs.UnloadModelRequest) error { - panic("implement me") -} - -func (m *mockStore) GetServers(shallow bool, modelDetails bool) ([]*store.ServerSnapshot, error) { - panic("implement me") -} - -func (m *mockStore) GetServer(serverKey string, shallow bool, modelDetails bool) (*store.ServerSnapshot, error) { - panic("implement me") -} - -func (m *mockStore) AddNewModelVersion(modelName string) error { - panic("implement me") -} - -func (m *mockStore) UpdateLoadedModels(modelKey string, version uint32, serverKey string, replicas []*store.ServerReplica) error { - panic("implement me") -} - -func (m *mockStore) UnloadVersionModels(modelKey string, version uint32) (bool, error) { - panic("implement me") -} - -func (m *mockStore) UnloadModelGwVersionModels(modelKey string, version uint32) (bool, error) { - panic("implement me") -} - -func (m *mockStore) UpdateModelState(modelKey string, version uint32, serverKey string, replicaIdx int, availableMemory *uint64, expectedState, desiredState store.ModelReplicaState, reason string, runtimeInfo *pbs.ModelRuntimeInfo) error { - model := m.models[modelKey] - for _, mv := range model.Versions { - if mv.GetVersion() == version { - mv.SetReplicaState(replicaIdx, desiredState, reason) - } - } - return nil -} - -func (m *mockStore) AddServerReplica(request *pb.AgentSubscribeRequest) error { - return nil -} - -func (m *mockStore) ServerNotify(request *pbs.ServerNotify) error { - panic("implement me") -} - -func (m *mockStore) RemoveServerReplica(serverName string, replicaIdx int) ([]string, error) { - return nil, nil -} - -func (m *mockStore) DrainServerReplica(serverName string, replicaIdx int) ([]string, error) { - panic("implement me") -} - -func (m *mockStore) GetAllModels() []string { - var modelNames []string - for modelName := range m.models { - modelNames = append(modelNames, modelName) - } - return modelNames -} - -func (m *mockStore) SetModelGwModelState(name string, versionNumber uint32, status store.ModelState, reason string, source string) error { - panic("implement me") -} - type mockGrpcStream struct { err error grpc.ServerStream @@ -164,12 +57,13 @@ func TestSync(t *testing.T) { type ExpectedVersionState struct { version uint32 - expectedStates map[int]store.ReplicaStatus + expectedStates map[int]db.ReplicaStatus } type test struct { name string agents map[ServerKey]*AgentSubscriber - store *mockStore + models []*db.Model + servers []*db.Server modelName string expectedVersionStates []ExpectedVersionState } @@ -180,24 +74,30 @@ func TestSync(t *testing.T) { agents: map[ServerKey]*AgentSubscriber{ {serverName: "server1", replicaIdx: 1}: {stream: &mockGrpcStream{ctx: context.Background()}}, }, - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion(&pbs.Model{Meta: &pbs.MetaData{Name: "iris"}}, 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.LoadRequested}, - }, false, store.ModelProgressing), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion(&pbs.Model{Meta: &pbs.MetaData{Name: "iris"}}, 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_LoadRequested}, + }, db.ModelState_ModelProgressing), + }, + }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 1: {}, }, }, }, expectedVersionStates: []ExpectedVersionState{ { version: 1, - expectedStates: map[int]store.ReplicaStatus{ - 1: {State: store.Loading}, + expectedStates: map[int]db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Loading}, }, }, }, @@ -208,24 +108,30 @@ func TestSync(t *testing.T) { agents: map[ServerKey]*AgentSubscriber{ {serverName: "server1", replicaIdx: 1}: {stream: &mockGrpcStream{ctx: cancelledCtx}}, }, - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion(&pbs.Model{Meta: &pbs.MetaData{Name: "iris"}}, 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.LoadRequested}, - }, false, store.ModelProgressing), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion(&pbs.Model{Meta: &pbs.MetaData{Name: "iris"}}, 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_LoadRequested}, + }, db.ModelState_ModelProgressing), + }, + }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 1: {}, }, }, }, expectedVersionStates: []ExpectedVersionState{ { version: 1, - expectedStates: map[int]store.ReplicaStatus{ - 1: {State: store.LoadFailed}, + expectedStates: map[int]db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_LoadFailed}, }, }, }, @@ -236,24 +142,30 @@ func TestSync(t *testing.T) { agents: map[ServerKey]*AgentSubscriber{ {serverName: "server1", replicaIdx: 1}: {stream: &mockGrpcStream{ctx: context.Background(), err: fmt.Errorf("error send")}}, }, - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion(&pbs.Model{Meta: &pbs.MetaData{Name: "iris"}}, 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.LoadRequested}, - }, false, store.ModelProgressing), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion(&pbs.Model{Meta: &pbs.MetaData{Name: "iris"}}, 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_LoadRequested}, + }, db.ModelState_ModelProgressing), + }, + }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 1: {}, }, }, }, expectedVersionStates: []ExpectedVersionState{ { version: 1, - expectedStates: map[int]store.ReplicaStatus{ - 1: {State: store.LoadFailed}, + expectedStates: map[int]db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_LoadFailed}, }, }, }, @@ -264,24 +176,30 @@ func TestSync(t *testing.T) { agents: map[ServerKey]*AgentSubscriber{ {serverName: "server1", replicaIdx: 1}: {stream: &mockGrpcStream{ctx: context.Background()}}, }, - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion(&pbs.Model{Meta: &pbs.MetaData{Name: "iris"}}, 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.UnloadRequested}, - }, false, store.ModelTerminating), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion(&pbs.Model{Meta: &pbs.MetaData{Name: "iris"}}, 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_UnloadRequested}, + }, db.ModelState_ModelTerminating), + }, + }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 1: {}, }, }, }, expectedVersionStates: []ExpectedVersionState{ { version: 1, - expectedStates: map[int]store.ReplicaStatus{ - 1: {State: store.Unloading}, + expectedStates: map[int]db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Unloading}, }, }, }, @@ -292,24 +210,30 @@ func TestSync(t *testing.T) { agents: map[ServerKey]*AgentSubscriber{ {serverName: "server1", replicaIdx: 1}: {stream: &mockGrpcStream{ctx: context.Background(), err: fmt.Errorf("error send")}}, }, - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion(&pbs.Model{Meta: &pbs.MetaData{Name: "iris"}}, 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.UnloadRequested}, - }, false, store.ModelTerminating), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion(&pbs.Model{Meta: &pbs.MetaData{Name: "iris"}}, 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_UnloadRequested}, + }, db.ModelState_ModelTerminating), + }, + }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 1: {}, }, }, }, expectedVersionStates: []ExpectedVersionState{ { version: 1, - expectedStates: map[int]store.ReplicaStatus{ - 1: {State: store.UnloadFailed}, + expectedStates: map[int]db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_UnloadFailed}, }, }, }, @@ -320,34 +244,40 @@ func TestSync(t *testing.T) { agents: map[ServerKey]*AgentSubscriber{ {serverName: "server1", replicaIdx: 1}: {stream: &mockGrpcStream{ctx: context.Background()}}, }, - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion(&pbs.Model{Meta: &pbs.MetaData{Name: "iris"}}, 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.UnloadRequested}, - }, false, store.ModelProgressing), - store.NewModelVersion(&pbs.Model{Meta: &pbs.MetaData{Name: "iris"}}, 2, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.LoadRequested}, - }, false, store.ModelProgressing), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion(&pbs.Model{Meta: &pbs.MetaData{Name: "iris"}}, 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_UnloadRequested}, + }, db.ModelState_ModelProgressing), + util.NewTestModelVersion(&pbs.Model{Meta: &pbs.MetaData{Name: "iris"}}, 2, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_LoadRequested}, + }, db.ModelState_ModelProgressing), + }, + }, + }, + servers: []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 1: {}, }, }, }, expectedVersionStates: []ExpectedVersionState{ { version: 1, - expectedStates: map[int]store.ReplicaStatus{ - 1: {State: store.Unloading}, + expectedStates: map[int]db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Unloading}, }, }, { version: 2, - expectedStates: map[int]store.ReplicaStatus{ - 1: {State: store.Loading}, + expectedStates: map[int]db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Loading}, }, }, }, @@ -359,10 +289,28 @@ func TestSync(t *testing.T) { logger := log.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - server := NewAgentServer(logger, test.store, nil, eventHub, false, tls.TLSOptions{}) + + // Create storage instances + modelStorage := store.NewInMemoryStorage[*db.Model]() + serverStorage := store.NewInMemoryStorage[*db.Server]() + + // Populate storage with test data + for _, model := range test.models { + err := modelStorage.Insert(context.TODO(), model) + g.Expect(err).To(BeNil()) + } + for _, server := range test.servers { + err := serverStorage.Insert(context.TODO(), server) + g.Expect(err).To(BeNil()) + } + + // Create MemoryStore with populated storage + ms := store.NewModelServerStore(logger, modelStorage, serverStorage, eventHub) + + server := NewAgentServer(logger, ms, nil, eventHub, false, tls.TLSOptions{}) server.agents = test.agents server.Sync(test.modelName) - model, err := test.store.GetModel(test.modelName) + model, err := modelStorage.Get(context.TODO(), test.modelName) g.Expect(err).To(BeNil()) for _, expectedVersionState := range test.expectedVersionStates { mv := model.GetVersion(expectedVersionState.version) @@ -456,7 +404,7 @@ func TestModelScalingProtos(t *testing.T) { type test struct { name string - store *mockStore + models []*db.Model trigger pb.ModelScalingTriggerMessage_Trigger triggerModelName string triggerModelVersion int @@ -467,21 +415,19 @@ func TestModelScalingProtos(t *testing.T) { tests := []test{ { name: "scale up not enabled", - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion( - &pbs.Model{ - Meta: &pbs.MetaData{Name: "iris"}, - DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1}, - }, - 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.Available}, - }, false, store.ModelAvailable), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "iris"}, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Available}, + }, db.ModelState_ModelAvailable), }, }, }, @@ -493,21 +439,19 @@ func TestModelScalingProtos(t *testing.T) { }, { name: "scale up within range no max", - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion( - &pbs.Model{ - Meta: &pbs.MetaData{Name: "iris"}, - DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 1}, - }, - 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.Available}, - }, false, store.ModelAvailable), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "iris"}, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 1}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Available}, + }, db.ModelState_ModelAvailable), }, }, }, @@ -519,21 +463,19 @@ func TestModelScalingProtos(t *testing.T) { }, { name: "scale up within range", - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion( - &pbs.Model{ - Meta: &pbs.MetaData{Name: "iris"}, - DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 1, MaxReplicas: 2}, - }, - 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.Available}, - }, false, store.ModelAvailable), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "iris"}, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 1, MaxReplicas: 2}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Available}, + }, db.ModelState_ModelAvailable), }, }, }, @@ -545,21 +487,19 @@ func TestModelScalingProtos(t *testing.T) { }, { name: "scale up not within range", - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion( - &pbs.Model{ - Meta: &pbs.MetaData{Name: "iris"}, - DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 1, MaxReplicas: 1}, - }, - 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.Available}, - }, false, store.ModelAvailable), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "iris"}, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 1, MaxReplicas: 1}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Available}, + }, db.ModelState_ModelAvailable), }, }, }, @@ -571,21 +511,19 @@ func TestModelScalingProtos(t *testing.T) { }, { name: "scale down within range", - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion( - &pbs.Model{ - Meta: &pbs.MetaData{Name: "iris"}, - DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1, MaxReplicas: 2}, - }, - 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.Available}, 2: {State: store.Available}, - }, false, store.ModelAvailable), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "iris"}, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1, MaxReplicas: 2}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Available}, 2: {State: db.ModelReplicaState_Available}, + }, db.ModelState_ModelAvailable), }, }, }, @@ -597,21 +535,19 @@ func TestModelScalingProtos(t *testing.T) { }, { name: "scale down not within range", - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion( - &pbs.Model{ - Meta: &pbs.MetaData{Name: "iris"}, - DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 2, MaxReplicas: 3}, - }, - 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.Available}, 2: {State: store.Available}, - }, false, store.ModelAvailable), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "iris"}, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 2, MaxReplicas: 3}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Available}, 2: {State: db.ModelReplicaState_Available}, + }, db.ModelState_ModelAvailable), }, }, }, @@ -623,21 +559,19 @@ func TestModelScalingProtos(t *testing.T) { }, { name: "scale down not enabled", - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion( - &pbs.Model{ - Meta: &pbs.MetaData{Name: "iris"}, - DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2}, - }, - 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.Available}, 2: {State: store.Available}, - }, false, store.ModelAvailable), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "iris"}, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Available}, 2: {State: db.ModelReplicaState_Available}, + }, db.ModelState_ModelAvailable), }, }, }, @@ -649,21 +583,19 @@ func TestModelScalingProtos(t *testing.T) { }, { name: "model not stable, scale down - should not proceed", - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion( - &pbs.Model{ - Meta: &pbs.MetaData{Name: "iris"}, - DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, - }, - 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.Available}, 2: {State: store.Available}, - }, false, store.ModelAvailable), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "iris"}, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Available}, 2: {State: db.ModelReplicaState_Available}, + }, db.ModelState_ModelAvailable), }, }, }, @@ -676,21 +608,19 @@ func TestModelScalingProtos(t *testing.T) { }, { name: "model not stable, scale up - should proceed", - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion( - &pbs.Model{ - Meta: &pbs.MetaData{Name: "iris"}, - DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, - }, - 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.Available}, 2: {State: store.Available}, - }, false, store.ModelAvailable), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "iris"}, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Available}, 2: {State: db.ModelReplicaState_Available}, + }, db.ModelState_ModelAvailable), }, }, }, @@ -703,21 +633,19 @@ func TestModelScalingProtos(t *testing.T) { }, { name: "model not available, scale up - should not proceed", - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion( - &pbs.Model{ - Meta: &pbs.MetaData{Name: "iris"}, - DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, - }, - 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.Available}, 2: {State: store.LoadFailed}, - }, false, store.ScheduleFailed), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "iris"}, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Available}, 2: {State: db.ModelReplicaState_LoadFailed}, + }, db.ModelState_ScheduleFailed), }, }, }, @@ -729,21 +657,19 @@ func TestModelScalingProtos(t *testing.T) { }, { name: "model not available, scale down - should proceed", - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion( - &pbs.Model{ - Meta: &pbs.MetaData{Name: "iris"}, - DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, - }, - 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.Available}, 2: {State: store.LoadFailed}, - }, false, store.ScheduleFailed), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "iris"}, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Available}, 2: {State: db.ModelReplicaState_LoadFailed}, + }, db.ModelState_ScheduleFailed), }, }, }, @@ -755,30 +681,28 @@ func TestModelScalingProtos(t *testing.T) { }, { name: "model available is not latest, scale up - should not proceed", - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion( - &pbs.Model{ - Meta: &pbs.MetaData{Name: "iris"}, - DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, - }, - 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.Available}, 2: {State: store.Available}, - }, false, store.ModelAvailable), - store.NewModelVersion( - &pbs.Model{ - Meta: &pbs.MetaData{Name: "iris"}, - DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, - }, - 2, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.Available}, 2: {State: store.Loading}, - }, false, store.ModelProgressing), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "iris"}, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Available}, 2: {State: db.ModelReplicaState_Available}, + }, db.ModelState_ModelAvailable), + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "iris"}, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, + }, + 2, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Available}, 2: {State: db.ModelReplicaState_Loading}, + }, db.ModelState_ModelProgressing), }, }, }, @@ -791,30 +715,28 @@ func TestModelScalingProtos(t *testing.T) { }, { name: "model versions mismatch - should not proceed", - store: &mockStore{ - models: map[string]*store.ModelSnapshot{ - "iris": { - Name: "iris", - Versions: []*store.ModelVersion{ - store.NewModelVersion( - &pbs.Model{ - Meta: &pbs.MetaData{Name: "iris"}, - DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, - }, - 1, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.Available}, 2: {State: store.Available}, - }, false, store.ModelAvailable), - store.NewModelVersion( - &pbs.Model{ - Meta: &pbs.MetaData{Name: "iris"}, - DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, - }, - 2, "server1", - map[int]store.ReplicaStatus{ - 1: {State: store.Available}, 2: {State: store.Available}, - }, false, store.ModelState(store.Available)), - }, + models: []*db.Model{ + { + Name: "iris", + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "iris"}, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Available}, 2: {State: db.ModelReplicaState_Available}, + }, db.ModelState_ModelAvailable), + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "iris"}, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1}, + }, + 2, "server1", + map[int32]*db.ReplicaStatus{ + 1: {State: db.ModelReplicaState_Available}, 2: {State: db.ModelReplicaState_Available}, + }, db.ModelState_ModelAvailable), }, }, }, @@ -825,10 +747,7 @@ func TestModelScalingProtos(t *testing.T) { isError: true, }, { - name: "model does not exist in scheduler state", - store: &mockStore{ - models: map[string]*store.ModelSnapshot{}, - }, + name: "model does not exist in scheduler state", trigger: pb.ModelScalingTriggerMessage_SCALE_UP, triggerModelName: "iris", triggerModelVersion: 1, @@ -839,14 +758,24 @@ func TestModelScalingProtos(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - model, _ := test.store.GetModel(test.triggerModelName) + + // Create storage instances + modelStorage := store.NewInMemoryStorage[*db.Model]() + + // Populate storage with test data + for _, model := range test.models { + err := modelStorage.Insert(context.TODO(), model) + g.Expect(err).To(BeNil()) + } + + model, _ := modelStorage.Get(context.TODO(), test.triggerModelName) if model != nil { // in the cases where the model is not in the scheduler state yet - lastestModel := model.GetLatest() - state := lastestModel.ModelState() - state.Timestamp = test.lastUpdate - lastestModel.SetModelState(state) + lastestModel := model.Latest() + state := lastestModel.State + state.Timestamp = timestamppb.New(test.lastUpdate) + lastestModel.State = state } else { - model = &store.ModelSnapshot{ + model = &db.Model{ Name: test.triggerModelName, } } @@ -1016,39 +945,57 @@ func TestSubscribe(t *testing.T) { agents []ag expectedAgentsCount int expectedAgentsCountAfterClose int + setupMock func(s *mock.MockScheduler) } tests := []test{ { name: "simple", agents: []ag{ - {1, true}, {2, true}, + {1, true}, + {2, true}, }, expectedAgentsCount: 2, expectedAgentsCountAfterClose: 0, + setupMock: func(s *mock.MockScheduler) { + s.EXPECT().ScheduleFailedModels().Return([]string{}, nil).MinTimes(2) + }, }, { name: "simple - no close", agents: []ag{ - {1, true}, {2, false}, + {1, true}, + {2, false}, }, expectedAgentsCount: 2, expectedAgentsCountAfterClose: 1, + setupMock: func(s *mock.MockScheduler) { + s.EXPECT().ScheduleFailedModels().Return([]string{}, nil).MinTimes(2) + }, }, { name: "duplicates", agents: []ag{ - {1, true}, {1, false}, + {1, true}, + {1, false}, }, expectedAgentsCount: 1, expectedAgentsCountAfterClose: 1, + setupMock: func(s *mock.MockScheduler) { + s.EXPECT().ScheduleFailedModels().Return([]string{}, nil).MinTimes(1) + }, }, { name: "duplicates with all close", agents: []ag{ - {1, true}, {1, true}, {1, true}, + {1, true}, + {1, true}, + {1, true}, }, expectedAgentsCount: 1, expectedAgentsCountAfterClose: 0, + setupMock: func(s *mock.MockScheduler) { + s.EXPECT().ScheduleFailedModels().Return([]string{}, nil).MinTimes(3) + }, }, } @@ -1069,10 +1016,20 @@ func TestSubscribe(t *testing.T) { } for _, test := range tests { t.Run(test.name, func(t *testing.T) { + ctrl := gomock.NewController(t) + mockScheduler := mock.NewMockScheduler(ctrl) + test.setupMock(mockScheduler) + logger := log.New() eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - server := NewAgentServer(logger, &mockStore{}, mockScheduler{}, eventHub, false, tls.TLSOptions{}) + + // Create storage instances + modelStorage := store.NewInMemoryStorage[*db.Model]() + serverStorage := store.NewInMemoryStorage[*db.Server]() + ms := store.NewModelServerStore(logger, modelStorage, serverStorage, eventHub) + + server := NewAgentServer(logger, ms, mockScheduler, eventHub, false, tls.TLSOptions{}) port, err := testing_utils.GetFreePortForTest() if err != nil { t.Fatal(err) diff --git a/scheduler/pkg/scheduler/interface.go b/scheduler/pkg/scheduler/interface.go index 33155e7814..a55aab9806 100644 --- a/scheduler/pkg/scheduler/interface.go +++ b/scheduler/pkg/scheduler/interface.go @@ -9,6 +9,7 @@ the Change License after the Change Date as each is defined in accordance with t package scheduler +//go:generate go tool mockgen -source=./interface.go -destination=./mock/interface.go -package=mock Scheduler type Scheduler interface { Schedule(modelKey string) error ScheduleFailedModels() ([]string, error) diff --git a/scheduler/pkg/scheduler/mock/interface.go b/scheduler/pkg/scheduler/mock/interface.go new file mode 100644 index 0000000000..0149fc822d --- /dev/null +++ b/scheduler/pkg/scheduler/mock/interface.go @@ -0,0 +1,68 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: ./interface.go +// +// Generated by this command: +// +// mockgen -source=./interface.go -destination=./mock/interface.go -package=mock Scheduler +// + +// Package mock is a generated GoMock package. +package mock + +import ( + reflect "reflect" + + gomock "go.uber.org/mock/gomock" +) + +// MockScheduler is a mock of Scheduler interface. +type MockScheduler struct { + ctrl *gomock.Controller + recorder *MockSchedulerMockRecorder +} + +// MockSchedulerMockRecorder is the mock recorder for MockScheduler. +type MockSchedulerMockRecorder struct { + mock *MockScheduler +} + +// NewMockScheduler creates a new mock instance. +func NewMockScheduler(ctrl *gomock.Controller) *MockScheduler { + mock := &MockScheduler{ctrl: ctrl} + mock.recorder = &MockSchedulerMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockScheduler) EXPECT() *MockSchedulerMockRecorder { + return m.recorder +} + +// Schedule mocks base method. +func (m *MockScheduler) Schedule(modelKey string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Schedule", modelKey) + ret0, _ := ret[0].(error) + return ret0 +} + +// Schedule indicates an expected call of Schedule. +func (mr *MockSchedulerMockRecorder) Schedule(modelKey any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Schedule", reflect.TypeOf((*MockScheduler)(nil).Schedule), modelKey) +} + +// ScheduleFailedModels mocks base method. +func (m *MockScheduler) ScheduleFailedModels() ([]string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ScheduleFailedModels") + ret0, _ := ret[0].([]string) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// ScheduleFailedModels indicates an expected call of ScheduleFailedModels. +func (mr *MockSchedulerMockRecorder) ScheduleFailedModels() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ScheduleFailedModels", reflect.TypeOf((*MockScheduler)(nil).ScheduleFailedModels)) +} From 535208a81a3c2724c285c07c795311cd23083886 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 12 Jan 2026 18:36:57 +0000 Subject: [PATCH 37/50] wip: control plane tests --- scheduler/pkg/server/control_plane_test.go | 7 +- scheduler/pkg/server/server_status_test.go | 161 +++++++++++---------- 2 files changed, 87 insertions(+), 81 deletions(-) diff --git a/scheduler/pkg/server/control_plane_test.go b/scheduler/pkg/server/control_plane_test.go index b3eed83971..1fd8372e5a 100644 --- a/scheduler/pkg/server/control_plane_test.go +++ b/scheduler/pkg/server/control_plane_test.go @@ -16,6 +16,7 @@ import ( "time" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" @@ -47,7 +48,7 @@ func TestStartServerStream(t *testing.T) { name: "success - ok", ctx: context.Background(), server: &SchedulerServer{ - modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -56,7 +57,7 @@ func TestStartServerStream(t *testing.T) { name: "failure - stream ctx cancelled", ctx: cancellCtx, server: &SchedulerServer{ - modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -66,7 +67,7 @@ func TestStartServerStream(t *testing.T) { name: "failure - timeout", ctx: context.Background(), server: &SchedulerServer{ - modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), nil), logger: log.New(), timeout: 1 * time.Millisecond, }, diff --git a/scheduler/pkg/server/server_status_test.go b/scheduler/pkg/server/server_status_test.go index cc8cacfea6..c0c2012bc5 100644 --- a/scheduler/pkg/server/server_status_test.go +++ b/scheduler/pkg/server/server_status_test.go @@ -16,6 +16,8 @@ import ( "time" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" log "github.com/sirupsen/logrus" "go.uber.org/mock/gomock" "google.golang.org/protobuf/proto" @@ -27,7 +29,6 @@ import ( "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" scheduler2 "github.com/seldonio/seldon-core/scheduler/v2/pkg/scheduler" "github.com/seldonio/seldon-core/scheduler/v2/pkg/scheduler/cleaner" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/experiment" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/mock" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/pipeline" @@ -39,22 +40,22 @@ func TestPollerRetryFailedModels(t *testing.T) { tests := []struct { name string funcName string - targetState store.ModelState + targetState db.ModelState operation string modelNames []string - setupMocks func(mockModelStore *mock.MockModelStore, modelNames []string, targetState store.ModelState) + setupMocks func(mockModelStore *mock.MockModelServerAPI, modelNames []string, targetState db.ModelState) contextTimeout time.Duration tickDuration time.Duration - validateMocks func(g *WithT, mockModelStore *mock.MockModelStore) + validateMocks func(g *WithT, mockModelStore *mock.MockModelServerAPI) maxRetries uint }{ { name: "context cancelled immediately", funcName: "testFunc", - targetState: store.ModelFailed, + targetState: db.ModelState_ModelFailed, operation: "create", modelNames: []string{}, - setupMocks: func(mockModelStore *mock.MockModelStore, modelNames []string, targetState store.ModelState) { + setupMocks: func(mockModelStore *mock.MockModelServerAPI, modelNames []string, targetState db.ModelState) { // No expectations - context cancelled before first tick }, contextTimeout: 0, // Cancel immediately @@ -63,10 +64,10 @@ func TestPollerRetryFailedModels(t *testing.T) { { name: "no models exist", funcName: "testFunc", - targetState: store.ModelFailed, + targetState: db.ModelState_ModelFailed, operation: "create", modelNames: []string{}, - setupMocks: func(mockModelStore *mock.MockModelStore, modelNames []string, targetState store.ModelState) { + setupMocks: func(mockModelStore *mock.MockModelServerAPI, modelNames []string, targetState db.ModelState) { mockModelStore.EXPECT(). GetAllModels(). Return([]string{}). @@ -78,22 +79,22 @@ func TestPollerRetryFailedModels(t *testing.T) { { name: "single model not in target state", funcName: "testFunc", - targetState: store.ModelFailed, + targetState: db.ModelState_ModelFailed, operation: "create", modelNames: []string{"model-1"}, - setupMocks: func(mockModelStore *mock.MockModelStore, modelNames []string, targetState store.ModelState) { + setupMocks: func(mockModelStore *mock.MockModelServerAPI, modelNames []string, targetState db.ModelState) { mockModelStore.EXPECT(). GetAllModels(). Return(modelNames). MinTimes(1) - model := &store.ModelSnapshot{} + model := &db.Model{} model.Name = "model-1" - modelVersion := store.NewModelVersion(&pb.Model{}, 1, "server-1", nil, false, 0) - modelVersion.SetModelState(store.ModelStatus{ - ModelGwState: store.ScheduleFailed, - }) - model.Versions = []*store.ModelVersion{modelVersion} + modelVersion := util.NewTestModelVersion(&pb.Model{}, 1, "server-1", nil, 0) + modelVersion.State = &db.ModelStatus{ + ModelGwState: db.ModelState_ScheduleFailed, + } + model.Versions = []*db.ModelVersion{modelVersion} mockModelStore.EXPECT(). GetModel("model-1"). @@ -106,21 +107,21 @@ func TestPollerRetryFailedModels(t *testing.T) { { name: "single model in failed state", funcName: "pollerRetryFailedCreateModels", - targetState: store.ModelFailed, + targetState: db.ModelState_ModelFailed, operation: "create", modelNames: []string{"failed-model"}, - setupMocks: func(mockModelStore *mock.MockModelStore, modelNames []string, targetState store.ModelState) { + setupMocks: func(mockModelStore *mock.MockModelServerAPI, modelNames []string, targetState db.ModelState) { mockModelStore.EXPECT(). GetAllModels(). Return(modelNames).MinTimes(1) - model := &store.ModelSnapshot{} + model := &db.Model{} model.Name = "failed-model" - modelVersion := store.NewModelVersion(&pb.Model{}, 1, "server-1", nil, false, 0) - modelVersion.SetModelState(store.ModelStatus{ - ModelGwState: store.ModelFailed, - }) - model.Versions = []*store.ModelVersion{modelVersion} + modelVersion := util.NewTestModelVersion(&pb.Model{}, 1, "server-1", nil, 0) + modelVersion.State = &db.ModelStatus{ + ModelGwState: db.ModelState_ModelFailed, + } + model.Versions = []*db.ModelVersion{modelVersion} mockModelStore.EXPECT(). GetModel("failed-model"). @@ -130,7 +131,7 @@ func TestPollerRetryFailedModels(t *testing.T) { mockModelStore.EXPECT().SetModelGwModelState( "failed-model", uint32(1), - store.ModelCreate, "No model gateway available to handle model", modelStatusEventSource).MinTimes(1) + db.ModelState_ModelCreate, "No model gateway available to handle model", modelStatusEventSource).MinTimes(1) }, contextTimeout: 100 * time.Millisecond, tickDuration: 50 * time.Millisecond, @@ -139,21 +140,21 @@ func TestPollerRetryFailedModels(t *testing.T) { { name: "max retries exceeded, do not retry", funcName: "pollerRetryFailedCreateModels", - targetState: store.ModelFailed, + targetState: db.ModelState_ModelFailed, operation: "create", modelNames: []string{"failed-model"}, - setupMocks: func(mockModelStore *mock.MockModelStore, modelNames []string, targetState store.ModelState) { + setupMocks: func(mockModelStore *mock.MockModelServerAPI, modelNames []string, targetState db.ModelState) { mockModelStore.EXPECT(). GetAllModels(). Return(modelNames).MinTimes(1) - model := &store.ModelSnapshot{} + model := &db.Model{} model.Name = "failed-model" - modelVersion := store.NewModelVersion(&pb.Model{}, 1, "server-1", nil, false, 0) - modelVersion.SetModelState(store.ModelStatus{ - ModelGwState: store.ModelFailed, - }) - model.Versions = []*store.ModelVersion{modelVersion} + modelVersion := util.NewTestModelVersion(&pb.Model{}, 1, "server-1", nil, 0) + modelVersion.State = &db.ModelStatus{ + ModelGwState: db.ModelState_ModelFailed, + } + model.Versions = []*db.ModelVersion{modelVersion} mockModelStore.EXPECT(). GetModel("failed-model"). @@ -171,7 +172,7 @@ func TestPollerRetryFailedModels(t *testing.T) { g := NewGomegaWithT(t) ctrl := gomock.NewController(t) - mockModelStore := mock.NewMockModelStore(ctrl) + mockModelStore := mock.NewMockModelServerAPI(ctrl) if tt.setupMocks != nil { tt.setupMocks(mockModelStore, tt.modelNames, tt.targetState) @@ -263,7 +264,7 @@ func TestTerminateModelGwVersionModels(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, _ := createTestScheduler(t) + s, _ := createTestScheduler() for _, lr := range test.loadReq { err := s.modelStore.UpdateModel(lr) g.Expect(err).To(BeNil()) @@ -277,21 +278,21 @@ func TestTerminateModelGwVersionModels(t *testing.T) { g.Expect(model.Versions).To(HaveLen(2)) // set model-gw status to available - mem, ok := s.modelStore.(*store.TestMemoryStore) + mem, ok := s.modelStore.(*db.TestMemoryStore) g.Expect(ok).To(BeTrue()) - err = mem.DirectlyUpdateModelStatus(store.ModelID{ + err = mem.DirectlyUpdateModelStatus(db.ModelID{ Name: modelName, Version: model.GetLatest().GetVersion(), - }, store.ModelStatus{ - ModelGwState: store.ModelAvailable, + }, db.ModelStatus{ + ModelGwState: db.ModelAvailable, }) g.Expect(err).To(BeNil()) // check if latest version is available model, err = s.modelStore.GetModel(modelName) g.Expect(err).To(BeNil()) - g.Expect(model.GetLatest().ModelState().ModelGwState).To(Equal(store.ModelAvailable)) + g.Expect(model.Latest().State.ModelGwState).To(Equal(db.ModelState_ModelAvailable)) // trigger cleanup clr := cleaner.NewTestVersionCleaner(s.modelStore, s.logger) @@ -304,7 +305,7 @@ func TestTerminateModelGwVersionModels(t *testing.T) { mv := model.GetPrevious() g.Expect(mv).ToNot(BeNil()) - g.Expect(mv.ModelState().ModelGwState).To(Equal(store.ModelTerminated)) + g.Expect(mv.ModelState().ModelGwState).To(Equal(db.ModelTerminated)) }) } @@ -332,7 +333,7 @@ func TestModelsStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: db.NewModelServerStore(log.New(), db.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -346,7 +347,7 @@ func TestModelsStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: db.NewModelServerStore(log.New(), db.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -361,7 +362,7 @@ func TestModelsStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: db.NewModelServerStore(log.New(), db.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 1 * time.Millisecond, }, @@ -431,7 +432,7 @@ func TestPublishModelsStatusWithTimeout(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, hub := createTestScheduler(t) + s, hub := createTestScheduler() s.timeout = test.timeout if test.loadReq != nil { err := s.modelStore.UpdateModel(test.loadReq) @@ -539,9 +540,9 @@ func TestAddAndRemoveModelNoModelGw(t *testing.T) { ms, err := s.modelStore.GetModel(modelName) g.Expect(err).To(BeNil()) - mv := ms.GetLatest() - g.Expect(mv.ModelState().ModelGwState).To(Equal(store.ModelCreate)) - g.Expect(mv.ModelState().ModelGwReason).To(Equal("No model gateway available to handle model")) + mv := ms.Latest() + g.Expect(mv.State.ModelGwState).To(Equal(db.ModelState_ModelCreate)) + g.Expect(mv.State.ModelGwReason).To(Equal("No model gateway available to handle model")) // remove model err = s.modelStore.RemoveModel(test.unloadReq) @@ -623,9 +624,9 @@ func TestModelGwRebalanceNoPipelineGw(t *testing.T) { ms, err := s.modelStore.GetModel(modelName) g.Expect(err).To(BeNil()) - mv := ms.GetLatest() - g.Expect(mv.ModelState().ModelGwState).To(Equal(store.ModelCreate)) - g.Expect(mv.ModelState().ModelGwReason).To(Equal("No model gateway available to handle model")) + mv := ms.Latest() + g.Expect(mv.State.ModelGwState).To(Equal(db.ModelState_ModelCreate)) + g.Expect(mv.State.ModelGwReason).To(Equal("No model gateway available to handle model")) // trigger rebalance s.modelGwRebalance() @@ -647,7 +648,7 @@ func TestModelGwRebalanceCorrectMessages(t *testing.T) { type test struct { name string loadReq *pb.LoadModelRequest - modelGwStatus store.ModelState + modelGwStatus db.ModelState operation pb.ModelStatusResponse_ModelOperation ctx context.Context } @@ -660,7 +661,7 @@ func TestModelGwRebalanceCorrectMessages(t *testing.T) { Meta: &pb.MetaData{Name: "foo"}, }, }, - modelGwStatus: store.ModelAvailable, + modelGwStatus: db.ModelState_ModelAvailable, operation: pb.ModelStatusResponse_ModelCreate, ctx: context.Background(), }, @@ -671,7 +672,7 @@ func TestModelGwRebalanceCorrectMessages(t *testing.T) { Meta: &pb.MetaData{Name: "foo"}, }, }, - modelGwStatus: store.ModelProgressing, + modelGwStatus: db.ModelState_ModelProgressing, operation: pb.ModelStatusResponse_ModelCreate, ctx: context.Background(), }, @@ -682,7 +683,7 @@ func TestModelGwRebalanceCorrectMessages(t *testing.T) { Meta: &pb.MetaData{Name: "foo"}, }, }, - modelGwStatus: store.ModelTerminating, + modelGwStatus: db.ModelState_ModelTerminating, operation: pb.ModelStatusResponse_ModelDelete, ctx: context.Background(), }, @@ -690,7 +691,7 @@ func TestModelGwRebalanceCorrectMessages(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, hub := createTestScheduler(t) + s, hub := createTestScheduler() // create operator stream operatorStream := newStubModelStatusServer(1, 5*time.Millisecond, test.ctx) @@ -764,7 +765,7 @@ func TestModelGwRebalanceCorrectMessages(t *testing.T) { s.modelGwRebalance() // check message is received by the operator - if test.modelGwStatus != store.ModelTerminating { + if test.modelGwStatus != db.ModelTerminating { msr = receiveMessageFromModelStream(operatorStream) g.Expect(msr).ToNot(BeNil()) g.Expect(msr.ModelName).To(Equal("foo")) @@ -897,14 +898,14 @@ func TestModelGwRebalance(t *testing.T) { modelName := req.Model.Meta.Name model, _ := s.modelStore.GetModel(modelName) - mem, ok := s.modelStore.(*store.TestMemoryStore) + mem, ok := s.modelStore.(*db.TestMemoryStore) g.Expect(ok).To(BeTrue()) - err = mem.DirectlyUpdateModelStatus(store.ModelID{ + err = mem.DirectlyUpdateModelStatus(db.ModelID{ Name: modelName, Version: model.GetLatest().GetVersion(), - }, store.ModelStatus{ - ModelGwState: store.ModelAvailable, + }, db.ModelStatus{ + ModelGwState: db.ModelAvailable, AvailableReplicas: 1, }) g.Expect(err).To(BeNil()) @@ -975,7 +976,7 @@ func TestServersStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: db.NewModelServerStore(log.New(), db.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -1011,7 +1012,7 @@ func TestServersStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: db.NewModelServerStore(log.New(), db.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -1048,7 +1049,7 @@ func TestServersStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: db.NewModelServerStore(log.New(), db.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -1063,7 +1064,7 @@ func TestServersStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: store.NewModelServerStore(log.New(), store.NewLocalSchedulerStore(), nil), + modelStore: db.NewModelServerStore(log.New(), db.NewLocalSchedulerStore(), nil), logger: log.New(), timeout: 1 * time.Millisecond, }, @@ -1084,7 +1085,8 @@ func TestServersStatusStream(t *testing.T) { expectedNumLoadedModelReplicas += int32(len(r.request.LoadedModels)) } else { server, _, err := test.server.modelStore.GetServer("foo", false) - server.Replicas[int(r.request.ReplicaIdx)].SetIsDraining() + g.Expect(err).To(BeNil()) + server.Replicas[int32(r.request.ReplicaIdx)].IsDraining = true } } } @@ -1160,8 +1162,8 @@ func TestModelEventsForServerStatus(t *testing.T) { }) g.Expect(err).To(BeNil()) err = s.modelStore.UpdateLoadedModels( - "foo", 1, "foo", []*store.ServerReplica{ - store.NewServerReplica("", 8080, 5001, 0, store.NewServer("foo", true), []string{}, 100, 100, 0, map[store.ModelVersionID]bool{}, 100), + "foo", 1, "foo", []*db.ServerReplica{ + db.NewServerReplica("", 8080, 5001, 0, db.NewServer("foo", true), []string{}, 100, 100, 0, map[db.ModelVersionID]bool{}, 100), }, ) g.Expect(err).To(BeNil()) @@ -1314,8 +1316,8 @@ func TestServerScaleUpEvents(t *testing.T) { }) g.Expect(err).To(BeNil()) err = s.modelStore.UpdateLoadedModels( - "foo-model", 1, "foo-server", []*store.ServerReplica{ - store.NewServerReplica("", 8080, 5001, 0, store.NewServer("foo-server", true), []string{}, 100, 100, 0, map[store.ModelVersionID]bool{}, 100), + "foo-model", 1, "foo-server", []*db.ServerReplica{ + db.NewServerReplica("", 8080, 5001, 0, db.NewServer("foo-server", true), []string{}, 100, 100, 0, map[db.ModelVersionID]bool{}, 100), }, ) g.Expect(err).To(BeNil()) @@ -1512,21 +1514,24 @@ func TestServerScaleDownEvents(t *testing.T) { } } -func createTestScheduler(t *testing.T) (*SchedulerServer, *coordinator.EventHub) { - return createTestSchedulerWithConfig(t, SchedulerServerConfig{}) +func createTestScheduler() (*SchedulerServer, *coordinator.EventHub) { + return createTestSchedulerWithConfig(SchedulerServerConfig{}) } -func createTestSchedulerWithConfig(t *testing.T, config SchedulerServerConfig) (*SchedulerServer, *coordinator.EventHub) { - return createTestSchedulerImpl(t, config) +func createTestSchedulerWithConfig(config SchedulerServerConfig) (*SchedulerServer, *coordinator.EventHub) { + return createTestSchedulerImpl(config) } -func createTestSchedulerImpl(t *testing.T, config SchedulerServerConfig) (*SchedulerServer, *coordinator.EventHub) { +func createTestSchedulerImpl(config SchedulerServerConfig) (*SchedulerServer, *coordinator.EventHub) { logger := log.New() logger.SetLevel(log.WarnLevel) eventHub, _ := coordinator.NewEventHub(logger) - schedulerStore := store.NewTestMemory(t, logger, store.NewLocalSchedulerStore(), eventHub) + modelStorage := store.NewInMemoryStorage[*db.Model]() + serverStorage := store.NewInMemoryStorage[*db.Server]() + schedulerStore := store.NewModelServerStore(logger, modelStorage, serverStorage, eventHub) + experimentServer := experiment.NewExperimentServer(logger, eventHub, nil, nil) pipelineServer := pipeline.NewPipelineStore(logger, eventHub, schedulerStore) @@ -1534,7 +1539,7 @@ func createTestSchedulerImpl(t *testing.T, config SchedulerServerConfig) (*Sched logger, schedulerStore, scheduler2.DefaultSchedulerConfig(schedulerStore), - synchroniser.NewSimpleSynchroniser(time.Duration(10*time.Millisecond)), + synchroniser.NewSimpleSynchroniser(10*time.Millisecond), eventHub, ) @@ -1542,7 +1547,7 @@ func createTestSchedulerImpl(t *testing.T, config SchedulerServerConfig) (*Sched pipelineGwLoadBalancer := util.NewRingLoadBalancer(1) s := NewSchedulerServer( logger, schedulerStore, experimentServer, pipelineServer, scheduler, - eventHub, synchroniser.NewSimpleSynchroniser(time.Duration(10*time.Millisecond)), config, + eventHub, synchroniser.NewSimpleSynchroniser(10*time.Millisecond), config, "", "", modelGwLoadBalancer, pipelineGwLoadBalancer, nil, tls.TLSOptions{}, ) From 9f69dfbfa28fc3d5f694964b4d748ccd47e4ed81 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Mon, 12 Jan 2026 19:16:14 +0000 Subject: [PATCH 38/50] wip: fixing server tests --- .../pkg/server/experiment_status_test.go | 2 +- scheduler/pkg/server/pipeline_status_test.go | 10 +- scheduler/pkg/server/server_status_test.go | 105 ++++++++------- scheduler/pkg/server/server_test.go | 20 ++- scheduler/pkg/server/utils_test.go | 125 +++++++++--------- 5 files changed, 140 insertions(+), 122 deletions(-) diff --git a/scheduler/pkg/server/experiment_status_test.go b/scheduler/pkg/server/experiment_status_test.go index e2f8c609f5..8a9a3449dc 100644 --- a/scheduler/pkg/server/experiment_status_test.go +++ b/scheduler/pkg/server/experiment_status_test.go @@ -155,7 +155,7 @@ func TestExperimentStatusEvents(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, hub := createTestScheduler(t) + s, _, hub := createTestScheduler() s.timeout = test.timeout if test.loadReq != nil { err := s.experimentServer.StartExperiment(test.loadReq) diff --git a/scheduler/pkg/server/pipeline_status_test.go b/scheduler/pkg/server/pipeline_status_test.go index 9a8c4ad9f5..cfe78526ce 100644 --- a/scheduler/pkg/server/pipeline_status_test.go +++ b/scheduler/pkg/server/pipeline_status_test.go @@ -378,7 +378,7 @@ func TestPublishPipelineEventWithTimeout(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, hub := createTestScheduler(t) + s, _, hub := createTestScheduler() s.timeout = test.timeout if test.loadReq != nil { err := s.pipelineHandler.AddPipeline(test.loadReq.Pipeline) @@ -459,7 +459,7 @@ func TestAddAndRemovePipelineNoPipelineGw(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, _ := createTestScheduler(t) + s, _, _ := createTestScheduler() // add operator stream stream := newStubPipelineStatusServer(100, 5*time.Millisecond, test.ctx) @@ -581,7 +581,7 @@ func TestPipelineGwRebalanceNoPipelineGw(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, _ := createTestScheduler(t) + s, _, _ := createTestScheduler() // add operator stream stream := newStubPipelineStatusServer(1, 5*time.Millisecond, test.ctx) @@ -714,7 +714,7 @@ func TestPipelineGwRebalanceCorrectMessages(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { // create a test scheduler - note it uses a load balancer with 1 partition - s, _ := createTestScheduler(t) + s, _, _ := createTestScheduler() // create operator stream operatorStream := newStubPipelineStatusServer(1, 5*time.Millisecond, test.ctx) @@ -959,7 +959,7 @@ func TestPipelineGwRebalance(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, _ := createTestScheduler(t) + s, _, _ := createTestScheduler() var streams []*stubPipelineStatusServer for i := 0; i < test.replicas; i++ { diff --git a/scheduler/pkg/server/server_status_test.go b/scheduler/pkg/server/server_status_test.go index c0c2012bc5..caccf013a0 100644 --- a/scheduler/pkg/server/server_status_test.go +++ b/scheduler/pkg/server/server_status_test.go @@ -264,7 +264,7 @@ func TestTerminateModelGwVersionModels(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, _ := createTestScheduler() + s, modelDirectStore, _ := createTestScheduler() for _, lr := range test.loadReq { err := s.modelStore.UpdateModel(lr) g.Expect(err).To(BeNil()) @@ -277,16 +277,17 @@ func TestTerminateModelGwVersionModels(t *testing.T) { // check number of versions g.Expect(model.Versions).To(HaveLen(2)) - // set model-gw status to available - mem, ok := s.modelStore.(*db.TestMemoryStore) - g.Expect(ok).To(BeTrue()) - - err = mem.DirectlyUpdateModelStatus(db.ModelID{ - Name: modelName, - Version: model.GetLatest().GetVersion(), - }, db.ModelStatus{ - ModelGwState: db.ModelAvailable, - }) + mv := model.Latest() + mv.State.ModelGwState = db.ModelState_ModelAvailable + err = modelDirectStore.Update(context.TODO(), model) + g.Expect(err).To(BeNil()) + // + //err = mem.DirectlyUpdateModelStatus(db.ModelID{ + // Name: modelName, + // Version: model.GetLatest().GetVersion(), + //}, db.ModelStatus{ + // ModelGwState: db.ModelAvailable, + //}) g.Expect(err).To(BeNil()) // check if latest version is available @@ -303,9 +304,9 @@ func TestTerminateModelGwVersionModels(t *testing.T) { model, err = s.modelStore.GetModel(modelName) g.Expect(err).To(BeNil()) - mv := model.GetPrevious() + mv = model.Versions[len(model.Versions)-2] g.Expect(mv).ToNot(BeNil()) - g.Expect(mv.ModelState().ModelGwState).To(Equal(db.ModelTerminated)) + g.Expect(mv.State.ModelGwState).To(Equal(db.ModelState_ModelTerminated)) }) } @@ -333,7 +334,7 @@ func TestModelsStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: db.NewModelServerStore(log.New(), db.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -347,7 +348,7 @@ func TestModelsStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: db.NewModelServerStore(log.New(), db.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -362,7 +363,7 @@ func TestModelsStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: db.NewModelServerStore(log.New(), db.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), nil), logger: log.New(), timeout: 1 * time.Millisecond, }, @@ -432,7 +433,7 @@ func TestPublishModelsStatusWithTimeout(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, hub := createTestScheduler() + s, _, hub := createTestScheduler() s.timeout = test.timeout if test.loadReq != nil { err := s.modelStore.UpdateModel(test.loadReq) @@ -504,7 +505,7 @@ func TestAddAndRemoveModelNoModelGw(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, hub := createTestScheduler(t) + s, _, hub := createTestScheduler() stream := newStubModelStatusServer(2, 5*time.Millisecond, context.Background()) s.modelEventStream.streams[stream] = &ModelSubscription{ @@ -587,7 +588,7 @@ func TestModelGwRebalanceNoPipelineGw(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, hub := createTestScheduler(t) + s, _, hub := createTestScheduler() stream := newStubModelStatusServer(2, 5*time.Millisecond, context.Background()) s.modelEventStream.streams[stream] = &ModelSubscription{ @@ -691,7 +692,7 @@ func TestModelGwRebalanceCorrectMessages(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, hub := createTestScheduler() + s, _, hub := createTestScheduler() // create operator stream operatorStream := newStubModelStatusServer(1, 5*time.Millisecond, test.ctx) @@ -758,14 +759,14 @@ func TestModelGwRebalanceCorrectMessages(t *testing.T) { ms, err := s.modelStore.GetModel(modelName) g.Expect(err).To(BeNil()) - mv := ms.GetLatest() - g.Expect(mv.ModelState().ModelGwState).To(Equal(test.modelGwStatus)) + mv := ms.Latest() + g.Expect(mv.State.ModelGwState).To(Equal(test.modelGwStatus)) // trigger rebalance s.modelGwRebalance() // check message is received by the operator - if test.modelGwStatus != db.ModelTerminating { + if test.modelGwStatus != db.ModelState_ModelTerminating { msr = receiveMessageFromModelStream(operatorStream) g.Expect(msr).ToNot(BeNil()) g.Expect(msr.ModelName).To(Equal("foo")) @@ -872,7 +873,7 @@ func TestModelGwRebalance(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, _ := createTestScheduler(t) + s, modelDirectStore, _ := createTestScheduler() var streams []*stubModelStatusServer for i := 0; i < test.replicas; i++ { @@ -898,16 +899,21 @@ func TestModelGwRebalance(t *testing.T) { modelName := req.Model.Meta.Name model, _ := s.modelStore.GetModel(modelName) - mem, ok := s.modelStore.(*db.TestMemoryStore) - g.Expect(ok).To(BeTrue()) - - err = mem.DirectlyUpdateModelStatus(db.ModelID{ - Name: modelName, - Version: model.GetLatest().GetVersion(), - }, db.ModelStatus{ - ModelGwState: db.ModelAvailable, - AvailableReplicas: 1, - }) + mv := model.Latest() + mv.State.ModelGwState = db.ModelState_ModelAvailable + err = modelDirectStore.Update(context.TODO(), model) + g.Expect(err).To(BeNil()) + // + //mem, ok := s.modelStore.(*db.TestMemoryStore) + //g.Expect(ok).To(BeTrue()) + // + //err = mem.DirectlyUpdateModelStatus(db.ModelID{ + // Name: modelName, + // Version: model.GetLatest().GetVersion(), + //}, db.ModelStatus{ + // ModelGwState: db.ModelAvailable, + // AvailableReplicas: 1, + //}) g.Expect(err).To(BeNil()) } @@ -976,7 +982,7 @@ func TestServersStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: db.NewModelServerStore(log.New(), db.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -1012,7 +1018,7 @@ func TestServersStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: db.NewModelServerStore(log.New(), db.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -1049,7 +1055,7 @@ func TestServersStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: db.NewModelServerStore(log.New(), db.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), nil), logger: log.New(), timeout: 10 * time.Millisecond, }, @@ -1064,7 +1070,7 @@ func TestServersStatusStream(t *testing.T) { }, }, server: &SchedulerServer{ - modelStore: db.NewModelServerStore(log.New(), db.NewLocalSchedulerStore(), nil), + modelStore: store.NewModelServerStore(log.New(), store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), nil), logger: log.New(), timeout: 1 * time.Millisecond, }, @@ -1150,7 +1156,7 @@ func TestModelEventsForServerStatus(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, _ := createTestScheduler(t) + s, _, _ := createTestScheduler() s.timeout = test.timeout if test.loadReq != nil { err := s.modelStore.AddServerReplica(test.loadReq) @@ -1163,7 +1169,9 @@ func TestModelEventsForServerStatus(t *testing.T) { g.Expect(err).To(BeNil()) err = s.modelStore.UpdateLoadedModels( "foo", 1, "foo", []*db.ServerReplica{ - db.NewServerReplica("", 8080, 5001, 0, db.NewServer("foo", true), []string{}, 100, 100, 0, map[db.ModelVersionID]bool{}, 100), + util.NewTestServerReplica("", 8080, 5001, 0, + store.NewServer("foo", true), []string{}, 100, 100, + 0, []*db.ModelVersionID{}, 100), }, ) g.Expect(err).To(BeNil()) @@ -1292,8 +1300,7 @@ func TestServerScaleUpEvents(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, hub := createTestSchedulerWithConfig( - t, + s, _, hub := createTestSchedulerWithConfig( SchedulerServerConfig{ AutoScalingServerEnabled: true, }, @@ -1317,7 +1324,8 @@ func TestServerScaleUpEvents(t *testing.T) { g.Expect(err).To(BeNil()) err = s.modelStore.UpdateLoadedModels( "foo-model", 1, "foo-server", []*db.ServerReplica{ - db.NewServerReplica("", 8080, 5001, 0, db.NewServer("foo-server", true), []string{}, 100, 100, 0, map[db.ModelVersionID]bool{}, 100), + util.NewTestServerReplica("", 8080, 5001, 0, store.NewServer("foo-server", true), + []string{}, 100, 100, 0, []*db.ModelVersionID{}, 100), }, ) g.Expect(err).To(BeNil()) @@ -1451,8 +1459,7 @@ func TestServerScaleDownEvents(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - s, event := createTestSchedulerWithConfig( - t, + s, _, event := createTestSchedulerWithConfig( SchedulerServerConfig{ AutoScalingServerEnabled: test.enabled, }, @@ -1514,15 +1521,15 @@ func TestServerScaleDownEvents(t *testing.T) { } } -func createTestScheduler() (*SchedulerServer, *coordinator.EventHub) { +func createTestScheduler() (*SchedulerServer, store.Storage[*db.Model], *coordinator.EventHub) { return createTestSchedulerWithConfig(SchedulerServerConfig{}) } -func createTestSchedulerWithConfig(config SchedulerServerConfig) (*SchedulerServer, *coordinator.EventHub) { +func createTestSchedulerWithConfig(config SchedulerServerConfig) (*SchedulerServer, store.Storage[*db.Model], *coordinator.EventHub) { return createTestSchedulerImpl(config) } -func createTestSchedulerImpl(config SchedulerServerConfig) (*SchedulerServer, *coordinator.EventHub) { +func createTestSchedulerImpl(config SchedulerServerConfig) (*SchedulerServer, store.Storage[*db.Model], *coordinator.EventHub) { logger := log.New() logger.SetLevel(log.WarnLevel) @@ -1551,5 +1558,5 @@ func createTestSchedulerImpl(config SchedulerServerConfig) (*SchedulerServer, *c "", "", modelGwLoadBalancer, pipelineGwLoadBalancer, nil, tls.TLSOptions{}, ) - return s, eventHub + return s, modelStorage, eventHub } diff --git a/scheduler/pkg/server/server_test.go b/scheduler/pkg/server/server_test.go index 782ebd8da9..09cc177072 100644 --- a/scheduler/pkg/server/server_test.go +++ b/scheduler/pkg/server/server_test.go @@ -18,6 +18,7 @@ import ( "time" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -58,7 +59,9 @@ func TestLoadModel(t *testing.T) { eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - schedulerStore := store.NewModelServerStore(logger, store.NewLocalSchedulerStore(), eventHub) + modelStorage := store.NewInMemoryStorage[*db.Model]() + serverStorage := store.NewInMemoryStorage[*db.Server]() + schedulerStore := store.NewModelServerStore(logger, modelStorage, serverStorage, eventHub) experimentServer := experiment.NewExperimentServer(logger, eventHub, nil, nil) pipelineServer := pipeline.NewPipelineStore(logger, eventHub, schedulerStore) sync := synchroniser.NewSimpleSynchroniser(time.Duration(10 * time.Millisecond)) @@ -327,8 +330,8 @@ func TestLoadModel(t *testing.T) { return } model, _ := s.modelStore.GetModel(event.ModelName) - latest := model.GetLatest() - if latest.ModelState().State == store.ScheduleFailed { + latest := model.Latest() + if latest.State.State == db.ModelState_ScheduleFailed { scheduledFailed.Store(true) } else { scheduledFailed.Store(false) @@ -364,7 +367,10 @@ func TestUnloadModel(t *testing.T) { log.SetLevel(log.DebugLevel) eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - schedulerStore := store.NewModelServerStore(logger, store.NewLocalSchedulerStore(), eventHub) + + modelStorage := store.NewInMemoryStorage[*db.Model]() + serverStorage := store.NewInMemoryStorage[*db.Server]() + schedulerStore := store.NewModelServerStore(logger, modelStorage, serverStorage, eventHub) experimentServer := experiment.NewExperimentServer(logger, eventHub, nil, nil) pipelineServer := pipeline.NewPipelineStore(logger, eventHub, schedulerStore) mockAgent := &mockAgentHandler{} @@ -466,7 +472,7 @@ func TestUnloadModel(t *testing.T) { g.Expect(r).ToNot(BeNil()) ms, err := s.modelStore.GetModel(modelName) g.Expect(err).To(BeNil()) - g.Expect(ms.GetLatest().ModelState().State).To(Equal(test.modelState)) + g.Expect(ms.Latest().State.State).To(Equal(test.modelState)) } }) @@ -710,7 +716,9 @@ func TestServerNotify(t *testing.T) { log.SetLevel(log.DebugLevel) eventHub, err := coordinator.NewEventHub(logger) g.Expect(err).To(BeNil()) - schedulerStore := store.NewModelServerStore(logger, store.NewLocalSchedulerStore(), eventHub) + modelStorage := store.NewInMemoryStorage[*db.Model]() + serverStorage := store.NewInMemoryStorage[*db.Server]() + schedulerStore := store.NewModelServerStore(logger, modelStorage, serverStorage, eventHub) sync := synchroniser.NewSimpleSynchroniser(time.Duration(10 * time.Millisecond)) scheduler := scheduler2.NewSimpleScheduler(logger, schedulerStore, diff --git a/scheduler/pkg/server/utils_test.go b/scheduler/pkg/server/utils_test.go index 28eaf12992..0e81bb73ca 100644 --- a/scheduler/pkg/server/utils_test.go +++ b/scheduler/pkg/server/utils_test.go @@ -15,6 +15,7 @@ import ( "time" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) @@ -81,7 +82,8 @@ func TestSouldScaleUp(t *testing.T) { name string shouldScaleUp bool newExpectedReplicas uint32 - server *store.ServerSnapshot + server *db.Server + stats *store.ServerStats } tests := []test{ @@ -89,62 +91,62 @@ func TestSouldScaleUp(t *testing.T) { name: "scales up to MaxReplicas", shouldScaleUp: true, newExpectedReplicas: 2, - server: &store.ServerSnapshot{ + server: &db.Server{ MaxReplicas: 2, ExpectedReplicas: 1, - Stats: &store.ServerStats{MaxNumReplicaHostedModels: 3}, }, + stats: &store.ServerStats{MaxNumReplicaHostedModels: 3}, }, { name: "scales up to MaxNumReplicaHostedModels", shouldScaleUp: true, newExpectedReplicas: 3, - server: &store.ServerSnapshot{ + server: &db.Server{ MaxReplicas: 4, ExpectedReplicas: 1, - Stats: &store.ServerStats{MaxNumReplicaHostedModels: 3}, }, + stats: &store.ServerStats{MaxNumReplicaHostedModels: 3}, }, { name: "should not scale if expectedReplicas is greater than MaxNumReplicaHostedModels", shouldScaleUp: false, - server: &store.ServerSnapshot{ + server: &db.Server{ MaxReplicas: 3, ExpectedReplicas: 3, - Stats: &store.ServerStats{MaxNumReplicaHostedModels: 2}, }, + stats: &store.ServerStats{MaxNumReplicaHostedModels: 2}, }, { name: "does not scale up for ExpectedReplicas below 0", shouldScaleUp: false, - server: &store.ServerSnapshot{ + server: &db.Server{ MaxReplicas: 2, ExpectedReplicas: -1, - Stats: &store.ServerStats{MaxNumReplicaHostedModels: 3}, }, + stats: &store.ServerStats{MaxNumReplicaHostedModels: 3}, }, { name: "does not scale up for missing max replicas", shouldScaleUp: false, - server: &store.ServerSnapshot{ + server: &db.Server{ ExpectedReplicas: 1, - Stats: &store.ServerStats{MaxNumReplicaHostedModels: 3}, }, + stats: &store.ServerStats{MaxNumReplicaHostedModels: 3}, }, { name: "does not scale to zero", shouldScaleUp: false, - server: &store.ServerSnapshot{ + server: &db.Server{ MaxReplicas: 0, ExpectedReplicas: 0, - Stats: &store.ServerStats{MaxNumReplicaHostedModels: 0}, }, + stats: &store.ServerStats{MaxNumReplicaHostedModels: 0}, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - ok, expectedReplicas := shouldScaleUp(test.server) + ok, expectedReplicas := shouldScaleUp(test.server, test.stats) g.Expect(ok).To(Equal(test.shouldScaleUp)) if test.shouldScaleUp { g.Expect(expectedReplicas).To(Equal(test.newExpectedReplicas)) @@ -158,7 +160,8 @@ func TestShouldScaleDown(t *testing.T) { type test struct { name string - server *store.ServerSnapshot + server *db.Server + stats *store.ServerStats shouldScaleDown bool expectedReplicas uint32 packThreshold float32 @@ -167,126 +170,126 @@ func TestShouldScaleDown(t *testing.T) { tests := []test{ { name: "should scale down - empty replicas", - server: &store.ServerSnapshot{ - Stats: &store.ServerStats{ - NumEmptyReplicas: 1, - MaxNumReplicaHostedModels: 0, - }, + server: &db.Server{ ExpectedReplicas: 2, MinReplicas: 1, }, + stats: &store.ServerStats{ + NumEmptyReplicas: 1, + MaxNumReplicaHostedModels: 0, + }, shouldScaleDown: true, expectedReplicas: 1, packThreshold: 0.0, }, { name: "should scale down - empty replicas > 1 - 1", - server: &store.ServerSnapshot{ - Stats: &store.ServerStats{ - NumEmptyReplicas: 2, - MaxNumReplicaHostedModels: 0, - }, + server: &db.Server{ ExpectedReplicas: 3, MinReplicas: 1, }, + stats: &store.ServerStats{ + NumEmptyReplicas: 2, + MaxNumReplicaHostedModels: 0, + }, shouldScaleDown: true, expectedReplicas: 1, packThreshold: 0.0, }, { name: "should scale down - violate min replicas", - server: &store.ServerSnapshot{ - Stats: &store.ServerStats{ - NumEmptyReplicas: 2, - MaxNumReplicaHostedModels: 0, - }, + server: &db.Server{ ExpectedReplicas: 3, MinReplicas: 2, }, + stats: &store.ServerStats{ + NumEmptyReplicas: 2, + MaxNumReplicaHostedModels: 0, + }, shouldScaleDown: true, expectedReplicas: 2, packThreshold: 0.0, }, { name: "should scale down - empty replicas > 1 - 2", - server: &store.ServerSnapshot{ - Stats: &store.ServerStats{ - NumEmptyReplicas: 1, - MaxNumReplicaHostedModels: 0, - }, + server: &db.Server{ ExpectedReplicas: 3, MinReplicas: 1, }, + stats: &store.ServerStats{ + NumEmptyReplicas: 1, + MaxNumReplicaHostedModels: 0, + }, shouldScaleDown: true, expectedReplicas: 2, packThreshold: 0.0, }, { name: "should scale down - pack", - server: &store.ServerSnapshot{ - Stats: &store.ServerStats{ - NumEmptyReplicas: 0, - MaxNumReplicaHostedModels: 1, - }, + server: &db.Server{ ExpectedReplicas: 2, MinReplicas: 1, }, + stats: &store.ServerStats{ + NumEmptyReplicas: 0, + MaxNumReplicaHostedModels: 1, + }, shouldScaleDown: true, expectedReplicas: 1, packThreshold: 1.0, }, { name: "should scale down - pack > 1", - server: &store.ServerSnapshot{ - Stats: &store.ServerStats{ - NumEmptyReplicas: 0, - MaxNumReplicaHostedModels: 1, - }, + server: &db.Server{ ExpectedReplicas: 3, MinReplicas: 1, }, + stats: &store.ServerStats{ + NumEmptyReplicas: 0, + MaxNumReplicaHostedModels: 1, + }, shouldScaleDown: true, expectedReplicas: 1, packThreshold: 1.0, }, { name: "should not scale down - pack threshold", - server: &store.ServerSnapshot{ - Stats: &store.ServerStats{ - NumEmptyReplicas: 0, - MaxNumReplicaHostedModels: 1, - }, + server: &db.Server{ ExpectedReplicas: 3, MinReplicas: 1, }, + stats: &store.ServerStats{ + NumEmptyReplicas: 0, + MaxNumReplicaHostedModels: 1, + }, shouldScaleDown: false, expectedReplicas: 0, packThreshold: 0.0, }, { name: "should not scale down - empty replicas - last replica", - server: &store.ServerSnapshot{ - Stats: &store.ServerStats{ - NumEmptyReplicas: 1, - MaxNumReplicaHostedModels: 0, - }, + server: &db.Server{ ExpectedReplicas: 1, MinReplicas: 1, }, + stats: &store.ServerStats{ + NumEmptyReplicas: 1, + MaxNumReplicaHostedModels: 0, + }, shouldScaleDown: false, expectedReplicas: 0, packThreshold: 0.0, }, { name: "should not scale down - pack - last replica", - server: &store.ServerSnapshot{ - Stats: &store.ServerStats{ - NumEmptyReplicas: 1, - MaxNumReplicaHostedModels: 0, - }, + server: &db.Server{ ExpectedReplicas: 1, MinReplicas: 1, }, + stats: &store.ServerStats{ + NumEmptyReplicas: 1, + MaxNumReplicaHostedModels: 0, + }, shouldScaleDown: false, expectedReplicas: 0, packThreshold: 1.0, @@ -295,7 +298,7 @@ func TestShouldScaleDown(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { - scaleDown, replicas := shouldScaleDown(test.server, test.packThreshold) + scaleDown, replicas := shouldScaleDown(test.server, test.stats, test.packThreshold) g.Expect(scaleDown).To(Equal(test.shouldScaleDown)) if scaleDown { g.Expect(replicas).To(Equal(test.expectedReplicas)) From f237969c1676a873a5624d77eeab8e94569063cf Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Tue, 13 Jan 2026 11:12:33 +0000 Subject: [PATCH 39/50] handle model not found when adding pipelines/experiments. it is valid for a pipeline/experiment to exist w/o a model/s --- scheduler/pkg/server/server_status_test.go | 8 ++++---- scheduler/pkg/store/experiment/state.go | 17 +++++++++++++++-- scheduler/pkg/store/experiment/validate.go | 2 +- scheduler/pkg/store/pipeline/status.go | 20 +++++++++++--------- scheduler/pkg/store/pipeline/store.go | 11 ++--------- 5 files changed, 33 insertions(+), 25 deletions(-) diff --git a/scheduler/pkg/server/server_status_test.go b/scheduler/pkg/server/server_status_test.go index caccf013a0..14473d0cf2 100644 --- a/scheduler/pkg/server/server_status_test.go +++ b/scheduler/pkg/server/server_status_test.go @@ -70,7 +70,7 @@ func TestPollerRetryFailedModels(t *testing.T) { setupMocks: func(mockModelStore *mock.MockModelServerAPI, modelNames []string, targetState db.ModelState) { mockModelStore.EXPECT(). GetAllModels(). - Return([]string{}). + Return([]string{}, nil). MinTimes(1) }, contextTimeout: 150 * time.Millisecond, @@ -85,7 +85,7 @@ func TestPollerRetryFailedModels(t *testing.T) { setupMocks: func(mockModelStore *mock.MockModelServerAPI, modelNames []string, targetState db.ModelState) { mockModelStore.EXPECT(). GetAllModels(). - Return(modelNames). + Return(modelNames, nil). MinTimes(1) model := &db.Model{} @@ -113,7 +113,7 @@ func TestPollerRetryFailedModels(t *testing.T) { setupMocks: func(mockModelStore *mock.MockModelServerAPI, modelNames []string, targetState db.ModelState) { mockModelStore.EXPECT(). GetAllModels(). - Return(modelNames).MinTimes(1) + Return(modelNames, nil).MinTimes(1) model := &db.Model{} model.Name = "failed-model" @@ -146,7 +146,7 @@ func TestPollerRetryFailedModels(t *testing.T) { setupMocks: func(mockModelStore *mock.MockModelServerAPI, modelNames []string, targetState db.ModelState) { mockModelStore.EXPECT(). GetAllModels(). - Return(modelNames).MinTimes(1) + Return(modelNames, nil).MinTimes(1) model := &db.Model{} model.Name = "failed-model" diff --git a/scheduler/pkg/store/experiment/state.go b/scheduler/pkg/store/experiment/state.go index 90b9b1919e..9e282a3d80 100644 --- a/scheduler/pkg/store/experiment/state.go +++ b/scheduler/pkg/store/experiment/state.go @@ -10,7 +10,10 @@ the Change License after the Change Date as each is defined in accordance with t package experiment import ( + "errors" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" pipeline2 "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/pipeline" ) @@ -159,7 +162,12 @@ func (es *ExperimentStore) setCandidateAndMirrorReadiness(experiment *Experiment for _, candidate := range experiment.Candidates { model, err := es.store.GetModel(candidate.Name) if err != nil { - logger.WithError(err).Infof("Failed to get model %s for candidate check for experiment %s", candidate.Name, experiment.Name) + if errors.Is(err, store.ErrNotFound) { + logger.Warnf("Model %s not found for experiment %s", candidate.Name, experiment.Name) + } else { + logger.WithError(err).Errorf("Failed to get model %s for experiment %s", candidate.Name, experiment.Name) + } + candidate.Ready = false } else { if model.Latest() != nil && model.Latest().State.State == db.ModelState_ModelAvailable { candidate.Ready = true @@ -171,7 +179,12 @@ func (es *ExperimentStore) setCandidateAndMirrorReadiness(experiment *Experiment if experiment.Mirror != nil { model, err := es.store.GetModel(experiment.Mirror.Name) if err != nil { - logger.WithError(err).Warnf("Failed to get model %s for mirror check for experiment %s", experiment.Mirror.Name, experiment.Name) + if errors.Is(err, store.ErrNotFound) { + logger.Warnf("Model %s not found for mirror experiment %s", experiment.Mirror.Name, experiment.Name) + } else { + logger.WithError(err).Errorf("Failed to get model %s for mirror experiment %s", experiment.Mirror.Name, experiment.Name) + } + experiment.Mirror.Ready = false } else { if model.Latest() != nil && model.Latest().State.State == db.ModelState_ModelAvailable { experiment.Mirror.Ready = true diff --git a/scheduler/pkg/store/experiment/validate.go b/scheduler/pkg/store/experiment/validate.go index 46d7458669..de12d3c197 100644 --- a/scheduler/pkg/store/experiment/validate.go +++ b/scheduler/pkg/store/experiment/validate.go @@ -27,7 +27,7 @@ func (es *ExperimentStore) validateNoExistingDefault(experiment *Experiment) err } } default: - return fmt.Errorf("Unknown resource type %v", experiment.ResourceType) + return fmt.Errorf("unknown resource type %v", experiment.ResourceType) } } return nil diff --git a/scheduler/pkg/store/pipeline/status.go b/scheduler/pkg/store/pipeline/status.go index 27045135de..c37e61dadf 100644 --- a/scheduler/pkg/store/pipeline/status.go +++ b/scheduler/pkg/store/pipeline/status.go @@ -10,6 +10,7 @@ the Change License after the Change Date as each is defined in accordance with t package pipeline import ( + "errors" "sync" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" @@ -33,13 +34,9 @@ type ModelStatusHandler struct { // Set pipeline model readiness // Setup references so we can update when model status' change -func (ms *ModelStatusHandler) addPipelineModelStatus(pipeline *Pipeline) error { - err := ms.setPipelineModelsReady(pipeline.GetLatestPipelineVersion()) - if err != nil { - return err - } +func (ms *ModelStatusHandler) addPipelineModelStatus(pipeline *Pipeline) { + ms.setPipelineModelsReady(pipeline.GetLatestPipelineVersion()) ms.addModelReferences(pipeline) - return nil } // Change a pipeline model readiness based on a new model status @@ -64,13 +61,19 @@ func updatePipelineModelsReady(latestPipeline *PipelineVersion, modelAvailable b } // Set pipeline models ready by finding out if all models are ready -func (ms *ModelStatusHandler) setPipelineModelsReady(pipelineVersion *PipelineVersion) error { +func (ms *ModelStatusHandler) setPipelineModelsReady(pipelineVersion *PipelineVersion) { modelsReady := true if pipelineVersion != nil && ms.store != nil { for stepName, step := range pipelineVersion.Steps { model, err := ms.store.GetModel(stepName) if err != nil { - return err + if errors.Is(err, store.ErrNotFound) { + ms.logger.WithField("model", stepName).Warn("Model for step not found, setting model step available=false") + } + ms.logger.WithError(err).WithField("model", stepName).Error("Failed to get model for step") + modelsReady = false + step.Available = false + continue } step.Available = false if model != nil { @@ -85,7 +88,6 @@ func (ms *ModelStatusHandler) setPipelineModelsReady(pipelineVersion *PipelineVe } pipelineVersion.State.ModelsReady = modelsReady } - return nil } // Find and set Pipeline Model Ready due to a Model whose status has changed diff --git a/scheduler/pkg/store/pipeline/store.go b/scheduler/pkg/store/pipeline/store.go index 8b88e76e83..9028044e31 100644 --- a/scheduler/pkg/store/pipeline/store.go +++ b/scheduler/pkg/store/pipeline/store.go @@ -162,12 +162,8 @@ func (ps *PipelineStore) InitialiseOrRestoreDB(path string, deletedResourceTTL u // note: we do not validate the pipeline when we restore it from the db as we assume it was validated when it was added func (ps *PipelineStore) restorePipeline(pipeline *Pipeline) { - logger := ps.logger.WithField("func", "restorePipeline") ps.mu.Lock() - err := ps.modelStatusHandler.addPipelineModelStatus(pipeline) - if err != nil { - logger.WithError(err).Errorf("Failed to set pipeline state for pipeline %s", pipeline.Name) - } + ps.modelStatusHandler.addPipelineModelStatus(pipeline) ps.updatePipelineState(pipeline) ps.pipelines[pipeline.Name] = pipeline @@ -255,10 +251,7 @@ func (ps *PipelineStore) addPipelineImpl(req *scheduler.Pipeline) (*coordinator. if err != nil { return nil, err } - err = ps.modelStatusHandler.addPipelineModelStatus(pipeline) - if err != nil { - return nil, err - } + ps.modelStatusHandler.addPipelineModelStatus(pipeline) ps.pipelines[req.Name] = pipeline if ps.db != nil { err = ps.db.save(pipeline) From a5f3a83b44564a0cb4c5fb39eb66b00e4582fd6c Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Tue, 13 Jan 2026 11:59:45 +0000 Subject: [PATCH 40/50] fixed server tests --- apis/go/mlops/scheduler/db/model_ext.go | 2 +- scheduler/pkg/server/server_status_test.go | 75 +++++++++++----------- scheduler/pkg/server/server_test.go | 27 ++++---- 3 files changed, 53 insertions(+), 51 deletions(-) diff --git a/apis/go/mlops/scheduler/db/model_ext.go b/apis/go/mlops/scheduler/db/model_ext.go index 4d1eaa1887..ec81dfa91e 100644 --- a/apis/go/mlops/scheduler/db/model_ext.go +++ b/apis/go/mlops/scheduler/db/model_ext.go @@ -68,7 +68,7 @@ func (m *ModelVersion) IsLoadingOrLoaded(server string, replicaIdx int) bool { } func (m *ModelVersion) DesiredReplicas() int { - return int(m.ModelDefn.DeploymentSpec.Replicas) + return int(m.ModelDefn.GetDeploymentSpec().GetReplicas()) } func (m *ModelVersion) ModelName() string { diff --git a/scheduler/pkg/server/server_status_test.go b/scheduler/pkg/server/server_status_test.go index 14473d0cf2..a26964ee4d 100644 --- a/scheduler/pkg/server/server_status_test.go +++ b/scheduler/pkg/server/server_status_test.go @@ -967,8 +967,8 @@ func TestServersStatusStream(t *testing.T) { type test struct { name string loadReq []serverReplicaRequest - server *SchedulerServer err bool + timeout time.Duration } tests := []test{ @@ -981,11 +981,7 @@ func TestServersStatusStream(t *testing.T) { }, }, }, - server: &SchedulerServer{ - modelStore: store.NewModelServerStore(log.New(), store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), nil), - logger: log.New(), - timeout: 10 * time.Millisecond, - }, + timeout: 10 * time.Millisecond, }, { name: "server ok - multiple replicas", @@ -1017,11 +1013,7 @@ func TestServersStatusStream(t *testing.T) { }, }, }, - server: &SchedulerServer{ - modelStore: store.NewModelServerStore(log.New(), store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), nil), - logger: log.New(), - timeout: 10 * time.Millisecond, - }, + timeout: 10 * time.Millisecond, }, { name: "server ok - multiple replicas with draining", @@ -1054,11 +1046,7 @@ func TestServersStatusStream(t *testing.T) { draining: true, }, }, - server: &SchedulerServer{ - modelStore: store.NewModelServerStore(log.New(), store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), nil), - logger: log.New(), - timeout: 10 * time.Millisecond, - }, + timeout: 10 * time.Millisecond, }, { name: "timeout", @@ -1069,55 +1057,64 @@ func TestServersStatusStream(t *testing.T) { }, }, }, - server: &SchedulerServer{ - modelStore: store.NewModelServerStore(log.New(), store.NewInMemoryStorage[*db.Model](), store.NewInMemoryStorage[*db.Server](), nil), - logger: log.New(), - timeout: 1 * time.Millisecond, - }, - err: true, + timeout: time.Millisecond, + err: true, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { + + modelStorage := store.NewInMemoryStorage[*db.Model]() + serverStorage := store.NewInMemoryStorage[*db.Server]() + + schedulerServer := &SchedulerServer{ + modelStore: store.NewModelServerStore(log.New(), modelStorage, serverStorage, nil), + logger: log.New(), + timeout: test.timeout, + } + expectedReplicas := int32(0) expectedNumLoadedModelReplicas := int32(0) if test.loadReq != nil { for _, r := range test.loadReq { - err := test.server.modelStore.AddServerReplica(r.request) + err := schedulerServer.modelStore.AddServerReplica(r.request) g.Expect(err).To(BeNil()) if !r.draining { expectedReplicas++ expectedNumLoadedModelReplicas += int32(len(r.request.LoadedModels)) } else { - server, _, err := test.server.modelStore.GetServer("foo", false) + server, _, err := schedulerServer.modelStore.GetServer("foo", false) g.Expect(err).To(BeNil()) server.Replicas[int32(r.request.ReplicaIdx)].IsDraining = true + err = serverStorage.Update(context.Background(), server) + g.Expect(err).To(BeNil()) } } } stream := newStubServerStatusServer(1, 5*time.Millisecond, context.Background()) - err := test.server.sendCurrentServerStatuses(stream) + err := schedulerServer.sendCurrentServerStatuses(stream) if test.err { g.Expect(err).ToNot(BeNil()) - } else { - g.Expect(err).To(BeNil()) + return + } - var ssr *pb.ServerStatusResponse - select { - case next := <-stream.msgs: - ssr = next - default: - t.Fail() - } + g.Expect(err).To(BeNil()) - g.Expect(ssr).ToNot(BeNil()) - g.Expect(ssr.ServerName).To(Equal("foo")) - g.Expect(ssr.GetAvailableReplicas()).To(Equal(expectedReplicas)) - g.Expect(ssr.NumLoadedModelReplicas).To(Equal(expectedNumLoadedModelReplicas)) - g.Expect(ssr.Type).To(Equal(pb.ServerStatusResponse_StatusUpdate)) + var ssr *pb.ServerStatusResponse + select { + case next := <-stream.msgs: + ssr = next + default: + t.Fail() } + + g.Expect(ssr).ToNot(BeNil()) + g.Expect(ssr.ServerName).To(Equal("foo")) + g.Expect(ssr.GetAvailableReplicas()).To(Equal(expectedReplicas)) + g.Expect(ssr.NumLoadedModelReplicas).To(Equal(expectedNumLoadedModelReplicas)) + g.Expect(ssr.Type).To(Equal(pb.ServerStatusResponse_StatusUpdate)) }) } } diff --git a/scheduler/pkg/server/server_test.go b/scheduler/pkg/server/server_test.go index 09cc177072..396cdf72ac 100644 --- a/scheduler/pkg/server/server_test.go +++ b/scheduler/pkg/server/server_test.go @@ -23,6 +23,7 @@ import ( "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" + "google.golang.org/protobuf/proto" pba "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" @@ -395,7 +396,7 @@ func TestUnloadModel(t *testing.T) { req []*pba.AgentSubscribeRequest model *pb.Model code codes.Code - modelState store.ModelState + modelState db.ModelState } modelName := "model1" smallMemory := uint64(100) @@ -410,7 +411,7 @@ func TestUnloadModel(t *testing.T) { }, model: &pb.Model{Meta: &pb.MetaData{Name: "model1"}, ModelSpec: &pb.ModelSpec{Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes: &smallMemory}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, code: codes.OK, - modelState: store.ModelTerminated, + modelState: db.ModelState_ModelTerminated, }, { name: "TwoReplicas", @@ -426,7 +427,7 @@ func TestUnloadModel(t *testing.T) { }, model: &pb.Model{Meta: &pb.MetaData{Name: "model1"}, ModelSpec: &pb.ModelSpec{Uri: "gs://model", Requirements: []string{"sklearn"}, MemoryBytes: &smallMemory}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 2}}, code: codes.OK, - modelState: store.ModelTerminated, + modelState: db.ModelState_ModelTerminated, }, { name: "NotExist", @@ -719,7 +720,7 @@ func TestServerNotify(t *testing.T) { modelStorage := store.NewInMemoryStorage[*db.Model]() serverStorage := store.NewInMemoryStorage[*db.Server]() schedulerStore := store.NewModelServerStore(logger, modelStorage, serverStorage, eventHub) - sync := synchroniser.NewSimpleSynchroniser(time.Duration(10 * time.Millisecond)) + sync := synchroniser.NewSimpleSynchroniser(10 * time.Millisecond) scheduler := scheduler2.NewSimpleScheduler(logger, schedulerStore, scheduler2.DefaultSchedulerConfig(schedulerStore), @@ -737,7 +738,7 @@ func TestServerNotify(t *testing.T) { type test struct { name string req *pb.ServerNotifyRequest - expectedServerStates []*store.ServerSnapshot + expectedServerStates []*db.Server signalTriggered bool } tests := []test{ @@ -760,20 +761,20 @@ func TestServerNotify(t *testing.T) { }, IsFirstSync: true, }, - expectedServerStates: []*store.ServerSnapshot{ + expectedServerStates: []*db.Server{ { Name: "server1", ExpectedReplicas: 2, MinReplicas: 1, MaxReplicas: 3, Shared: true, - Replicas: map[int]*store.ServerReplica{}, + Replicas: map[int32]*db.ServerReplica{}, }, { Name: "server2", ExpectedReplicas: 3, Shared: true, - Replicas: map[int]*store.ServerReplica{}, + Replicas: map[int32]*db.ServerReplica{}, }, }, signalTriggered: true, @@ -800,12 +801,12 @@ func TestServerNotify(t *testing.T) { }, IsFirstSync: false, }, - expectedServerStates: []*store.ServerSnapshot{ + expectedServerStates: []*db.Server{ { Name: "server1", ExpectedReplicas: 2, Shared: true, - Replicas: map[int]*store.ServerReplica{}, + Replicas: map[int32]*db.ServerReplica{}, }, }, signalTriggered: false, @@ -827,7 +828,11 @@ func TestServerNotify(t *testing.T) { sort.Slice(test.expectedServerStates, func(i, j int) bool { return test.expectedServerStates[i].Name < test.expectedServerStates[j].Name }) - g.Expect(actualServers).To(Equal(test.expectedServerStates)) + + g.Expect(len(actualServers)).To(Equal(len(test.expectedServerStates))) + for i, server := range actualServers { + g.Expect(proto.Equal(server, test.expectedServerStates[i])).To(BeTrue()) + } g.Expect(sync.IsTriggered()).To(Equal(test.signalTriggered)) }) From a683591fc30b38ddf54cea5449d1653a55101327 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Tue, 13 Jan 2026 13:23:45 +0000 Subject: [PATCH 41/50] fix tests --- .../pkg/health-probe/http_server_test.go | 2 +- scheduler/pkg/kafka/dataflow/server_test.go | 5 ++++- scheduler/pkg/store/pipeline/status.go | 19 ++++++++++++------- scheduler/pkg/store/pipeline/store.go | 11 +++++++++-- 4 files changed, 26 insertions(+), 11 deletions(-) diff --git a/scheduler/pkg/health-probe/http_server_test.go b/scheduler/pkg/health-probe/http_server_test.go index ddb5fdbe19..89153f95d5 100644 --- a/scheduler/pkg/health-probe/http_server_test.go +++ b/scheduler/pkg/health-probe/http_server_test.go @@ -31,7 +31,7 @@ func TestHTTPServer_Start(t *testing.T) { pathLiveness = "/live" pathStartup = "/startup" - port = 8080 + port = 8085 ) tests := []struct { diff --git a/scheduler/pkg/kafka/dataflow/server_test.go b/scheduler/pkg/kafka/dataflow/server_test.go index f969a2bfa3..828a18ec0b 100644 --- a/scheduler/pkg/kafka/dataflow/server_test.go +++ b/scheduler/pkg/kafka/dataflow/server_test.go @@ -19,6 +19,7 @@ import ( "time" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "go.uber.org/mock/gomock" "google.golang.org/grpc" @@ -1674,7 +1675,9 @@ func createTestScheduler(t *testing.T, serverName string) (*ChainerServer, *coor eventHub, _ := coordinator.NewEventHub(logger) - schedulerStore := store.NewModelServerStore(logger, store.NewLocalSchedulerStore(), eventHub) + modelStorage := store.NewInMemoryStorage[*db.Model]() + serverStorage := store.NewInMemoryStorage[*db.Server]() + schedulerStore := store.NewModelServerStore(logger, modelStorage, serverStorage, eventHub) pipelineServer := pipeline.NewPipelineStore(logger, eventHub, schedulerStore) data := diff --git a/scheduler/pkg/store/pipeline/status.go b/scheduler/pkg/store/pipeline/status.go index c37e61dadf..8a151a47d0 100644 --- a/scheduler/pkg/store/pipeline/status.go +++ b/scheduler/pkg/store/pipeline/status.go @@ -11,6 +11,7 @@ package pipeline import ( "errors" + "fmt" "sync" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" @@ -34,9 +35,12 @@ type ModelStatusHandler struct { // Set pipeline model readiness // Setup references so we can update when model status' change -func (ms *ModelStatusHandler) addPipelineModelStatus(pipeline *Pipeline) { - ms.setPipelineModelsReady(pipeline.GetLatestPipelineVersion()) +func (ms *ModelStatusHandler) addPipelineModelStatus(pipeline *Pipeline) error { + if err := ms.setPipelineModelsReady(pipeline.GetLatestPipelineVersion()); err != nil { + return err + } ms.addModelReferences(pipeline) + return nil } // Change a pipeline model readiness based on a new model status @@ -61,7 +65,7 @@ func updatePipelineModelsReady(latestPipeline *PipelineVersion, modelAvailable b } // Set pipeline models ready by finding out if all models are ready -func (ms *ModelStatusHandler) setPipelineModelsReady(pipelineVersion *PipelineVersion) { +func (ms *ModelStatusHandler) setPipelineModelsReady(pipelineVersion *PipelineVersion) error { modelsReady := true if pipelineVersion != nil && ms.store != nil { for stepName, step := range pipelineVersion.Steps { @@ -69,11 +73,11 @@ func (ms *ModelStatusHandler) setPipelineModelsReady(pipelineVersion *PipelineVe if err != nil { if errors.Is(err, store.ErrNotFound) { ms.logger.WithField("model", stepName).Warn("Model for step not found, setting model step available=false") + modelsReady = false + step.Available = false + continue } - ms.logger.WithError(err).WithField("model", stepName).Error("Failed to get model for step") - modelsReady = false - step.Available = false - continue + return fmt.Errorf("failed to get model %s: %w", stepName, err) } step.Available = false if model != nil { @@ -88,6 +92,7 @@ func (ms *ModelStatusHandler) setPipelineModelsReady(pipelineVersion *PipelineVe } pipelineVersion.State.ModelsReady = modelsReady } + return nil } // Find and set Pipeline Model Ready due to a Model whose status has changed diff --git a/scheduler/pkg/store/pipeline/store.go b/scheduler/pkg/store/pipeline/store.go index 9028044e31..8b88e76e83 100644 --- a/scheduler/pkg/store/pipeline/store.go +++ b/scheduler/pkg/store/pipeline/store.go @@ -162,8 +162,12 @@ func (ps *PipelineStore) InitialiseOrRestoreDB(path string, deletedResourceTTL u // note: we do not validate the pipeline when we restore it from the db as we assume it was validated when it was added func (ps *PipelineStore) restorePipeline(pipeline *Pipeline) { + logger := ps.logger.WithField("func", "restorePipeline") ps.mu.Lock() - ps.modelStatusHandler.addPipelineModelStatus(pipeline) + err := ps.modelStatusHandler.addPipelineModelStatus(pipeline) + if err != nil { + logger.WithError(err).Errorf("Failed to set pipeline state for pipeline %s", pipeline.Name) + } ps.updatePipelineState(pipeline) ps.pipelines[pipeline.Name] = pipeline @@ -251,7 +255,10 @@ func (ps *PipelineStore) addPipelineImpl(req *scheduler.Pipeline) (*coordinator. if err != nil { return nil, err } - ps.modelStatusHandler.addPipelineModelStatus(pipeline) + err = ps.modelStatusHandler.addPipelineModelStatus(pipeline) + if err != nil { + return nil, err + } ps.pipelines[req.Name] = pipeline if ps.db != nil { err = ps.db.save(pipeline) From 5021cbe35a9648bf1bc4ae9996d963a62ced9b39 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Wed, 14 Jan 2026 10:59:50 +0000 Subject: [PATCH 42/50] fix filter tests --- .../scheduler/filters/deletedserver_test.go | 21 ++++++------ .../scheduler/filters/replicadraining_test.go | 7 ++-- .../scheduler/filters/replicamemory_test.go | 24 +++++++------- .../scheduler/filters/serverreplicas_test.go | 23 +++++++------ .../filters/serverrequirements_test.go | 32 +++++++++---------- .../pkg/scheduler/filters/sharing_test.go | 32 +++++++++---------- 6 files changed, 66 insertions(+), 73 deletions(-) diff --git a/scheduler/pkg/scheduler/filters/deletedserver_test.go b/scheduler/pkg/scheduler/filters/deletedserver_test.go index bb71cf0678..e1cf4e3865 100644 --- a/scheduler/pkg/scheduler/filters/deletedserver_test.go +++ b/scheduler/pkg/scheduler/filters/deletedserver_test.go @@ -13,10 +13,10 @@ import ( "testing" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" - - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) func TestDeletedServerFilter(t *testing.T) { @@ -24,22 +24,21 @@ func TestDeletedServerFilter(t *testing.T) { type test struct { name string - model *store.ModelVersion - server *store.ServerSnapshot + model *db.ModelVersion + server *db.Server expected bool } serverName := "server1" - model := store.NewModelVersion( + model := util.NewTestModelVersion( &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, 1, serverName, - map[int]store.ReplicaStatus{3: {State: store.Loading}}, - false, - store.ModelProgressing) + map[int32]*db.ReplicaStatus{3: {State: db.ModelReplicaState_Loading}}, + db.ModelState_ModelProgressing) tests := []test{ - {name: "DeletedServer", model: model, server: &store.ServerSnapshot{Name: serverName, Shared: true, ExpectedReplicas: 0}, expected: false}, - {name: "UnknownServerReplicas", model: model, server: &store.ServerSnapshot{Name: serverName, Shared: true, ExpectedReplicas: -1}, expected: true}, - {name: "ActiveServer", model: model, server: &store.ServerSnapshot{Name: serverName, Shared: true, ExpectedReplicas: 4}, expected: true}, + {name: "DeletedServer", model: model, server: &db.Server{Name: serverName, Shared: true, ExpectedReplicas: 0}, expected: false}, + {name: "UnknownServerReplicas", model: model, server: &db.Server{Name: serverName, Shared: true, ExpectedReplicas: -1}, expected: true}, + {name: "ActiveServer", model: model, server: &db.Server{Name: serverName, Shared: true, ExpectedReplicas: 4}, expected: true}, } for _, test := range tests { diff --git a/scheduler/pkg/scheduler/filters/replicadraining_test.go b/scheduler/pkg/scheduler/filters/replicadraining_test.go index 5bfa14da7c..dd50e8b719 100644 --- a/scheduler/pkg/scheduler/filters/replicadraining_test.go +++ b/scheduler/pkg/scheduler/filters/replicadraining_test.go @@ -13,8 +13,7 @@ import ( "testing" . "github.com/onsi/gomega" - - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" ) func TestReplicaDrainingFilter(t *testing.T) { @@ -34,9 +33,9 @@ func TestReplicaDrainingFilter(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { filter := ReplicaDrainingFilter{} - replica := store.ServerReplica{} + replica := db.ServerReplica{} if test.isDraining { - replica.SetIsDraining() + replica.IsDraining = true } ok := filter.Filter(nil, &replica) g.Expect(ok).To(Equal(test.expected)) diff --git a/scheduler/pkg/scheduler/filters/replicamemory_test.go b/scheduler/pkg/scheduler/filters/replicamemory_test.go index 1cc3d969a9..86fc0a6c2a 100644 --- a/scheduler/pkg/scheduler/filters/replicamemory_test.go +++ b/scheduler/pkg/scheduler/filters/replicamemory_test.go @@ -13,29 +13,29 @@ import ( "testing" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" - - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) -func getTestModelWithMemory(requiredmemory *uint64, serverName string, replicaId int) *store.ModelVersion { +func getTestModelWithMemory(requiredmemory *uint64, serverName string, replicaId int) *db.ModelVersion { - replicas := map[int]store.ReplicaStatus{} + replicas := map[int32]*db.ReplicaStatus{} if replicaId >= 0 { - replicas[replicaId] = store.ReplicaStatus{State: store.Loading} + replicas[int32(replicaId)] = &db.ReplicaStatus{State: db.ModelReplicaState_Loading} } - return store.NewModelVersion( + return util.NewTestModelVersion( &pb.Model{ModelSpec: &pb.ModelSpec{MemoryBytes: requiredmemory}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, 1, serverName, replicas, - false, - store.ModelProgressing) + db.ModelState_ModelProgressing) } -func getTestServerReplicaWithMemory(availableMemory, reservedMemory uint64, serverName string, replicaId int) *store.ServerReplica { - return store.NewServerReplica("svc", 8080, 5001, replicaId, store.NewServer(serverName, true), []string{}, availableMemory, availableMemory, reservedMemory, nil, 100) +func getTestServerReplicaWithMemory(availableMemory, reservedMemory uint64, serverName string, replicaId int) *db.ServerReplica { + return util.NewTestServerReplica("svc", 8080, 5001, int32(replicaId), store.NewServer(serverName, true), []string{}, availableMemory, availableMemory, reservedMemory, nil, 100) } func TestReplicaMemoryFilter(t *testing.T) { @@ -43,8 +43,8 @@ func TestReplicaMemoryFilter(t *testing.T) { type test struct { name string - model *store.ModelVersion - server *store.ServerReplica + model *db.ModelVersion + server *db.ServerReplica expected bool } diff --git a/scheduler/pkg/scheduler/filters/serverreplicas_test.go b/scheduler/pkg/scheduler/filters/serverreplicas_test.go index 2a97a0c104..b8b88430a7 100644 --- a/scheduler/pkg/scheduler/filters/serverreplicas_test.go +++ b/scheduler/pkg/scheduler/filters/serverreplicas_test.go @@ -13,10 +13,10 @@ import ( "testing" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" - - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) func TestServerReplicasFilter(t *testing.T) { @@ -24,23 +24,22 @@ func TestServerReplicasFilter(t *testing.T) { type test struct { name string - model *store.ModelVersion - server *store.ServerSnapshot + model *db.ModelVersion + server *db.Server expected bool } serverName := "server1" - model := store.NewModelVersion( + model := util.NewTestModelVersion( &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, 1, serverName, - map[int]store.ReplicaStatus{3: {State: store.Loading}}, - false, - store.ModelProgressing) + map[int32]*db.ReplicaStatus{3: {State: db.ModelReplicaState_Loading}}, + db.ModelState_ModelProgressing) tests := []test{ { name: "No Replicas", model: model, - server: &store.ServerSnapshot{Name: serverName, + server: &db.Server{Name: serverName, Shared: true, ExpectedReplicas: 0, }, @@ -49,11 +48,11 @@ func TestServerReplicasFilter(t *testing.T) { { name: "Replicas", model: model, - server: &store.ServerSnapshot{Name: serverName, + server: &db.Server{Name: serverName, Shared: true, ExpectedReplicas: 0, - Replicas: map[int]*store.ServerReplica{ - 0: &store.ServerReplica{}, + Replicas: map[int32]*db.ServerReplica{ + 0: {}, }, }, expected: true, diff --git a/scheduler/pkg/scheduler/filters/serverrequirements_test.go b/scheduler/pkg/scheduler/filters/serverrequirements_test.go index f4fabe66f5..b47b5b820c 100644 --- a/scheduler/pkg/scheduler/filters/serverrequirements_test.go +++ b/scheduler/pkg/scheduler/filters/serverrequirements_test.go @@ -13,17 +13,18 @@ import ( "testing" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" - - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) func TestServerRequirementFilter(t *testing.T) { g := NewGomegaWithT(t) - makeModel := func(requirements []string) *store.ModelVersion { - return store.NewModelVersion( + makeModel := func(requirements []string) *db.ModelVersion { + return util.NewTestModelVersion( &pb.Model{ ModelSpec: &pb.ModelSpec{ Requirements: requirements, @@ -34,34 +35,31 @@ func TestServerRequirementFilter(t *testing.T) { }, 1, "server", - map[int]store.ReplicaStatus{ - 3: {State: store.Loading}, + map[int32]*db.ReplicaStatus{ + 3: {State: db.ModelReplicaState_Loading}, }, - false, - store.ModelProgressing, + db.ModelState_ModelProgressing, ) } - makeServerReplica := func(server *store.Server, capabilities []string) *store.ServerReplica { - return store.NewServerReplica("svc", 8080, 5001, 1, store.NewServer("server", true), capabilities, 100, 100, 0, nil, 100) + makeServerReplica := func(server *db.Server, capabilities []string) *db.ServerReplica { + return util.NewTestServerReplica("svc", 8080, 5001, 1, store.NewServer("server", true), capabilities, 100, 100, 0, nil, 100) } - makeServer := func(replicas int, capabilities []string, startIdx int) *store.ServerSnapshot { + makeServer := func(replicas int, capabilities []string, startIdx int) *db.Server { server := store.NewServer("server", true) - snapshot := server.CreateSnapshot(false, false) - for i := 0; i < replicas; i++ { replica := makeServerReplica(server, capabilities) - snapshot.Replicas[i+startIdx] = replica + server.Replicas[int32(i+startIdx)] = replica } - return snapshot + return server } type test struct { name string - model *store.ModelVersion - server *store.ServerSnapshot + model *db.ModelVersion + server *db.Server expected bool } diff --git a/scheduler/pkg/scheduler/filters/sharing_test.go b/scheduler/pkg/scheduler/filters/sharing_test.go index c9010dfe6d..b052b7b58d 100644 --- a/scheduler/pkg/scheduler/filters/sharing_test.go +++ b/scheduler/pkg/scheduler/filters/sharing_test.go @@ -13,10 +13,10 @@ import ( "testing" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" - - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) func TestSharingFilter(t *testing.T) { @@ -24,31 +24,29 @@ func TestSharingFilter(t *testing.T) { type test struct { name string - model *store.ModelVersion - server *store.ServerSnapshot + model *db.ModelVersion + server *db.Server expected bool } serverName := "server1" - modelExplicitServer := store.NewModelVersion( + modelExplicitServer := util.NewTestModelVersion( &pb.Model{ModelSpec: &pb.ModelSpec{Server: &serverName}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, 1, serverName, - map[int]store.ReplicaStatus{3: {State: store.Loading}}, - false, - store.ModelProgressing) - modelSharedServer := store.NewModelVersion( + map[int32]*db.ReplicaStatus{3: {State: db.ModelReplicaState_Loading}}, + db.ModelState_ModelProgressing) + modelSharedServer := util.NewTestModelVersion( &pb.Model{ModelSpec: &pb.ModelSpec{}, DeploymentSpec: &pb.DeploymentSpec{Replicas: 1}}, 1, serverName, - map[int]store.ReplicaStatus{3: {State: store.Loading}}, - false, - store.ModelProgressing) + map[int32]*db.ReplicaStatus{3: {State: db.ModelReplicaState_Loading}}, + db.ModelState_ModelProgressing) tests := []test{ - {name: "ModelAndServerMatchNotShared", model: modelExplicitServer, server: &store.ServerSnapshot{Name: serverName, Shared: false}, expected: true}, - {name: "ModelAndServerMatchShared", model: modelExplicitServer, server: &store.ServerSnapshot{Name: serverName, Shared: true}, expected: true}, - {name: "ModelAndServerDontMatch", model: modelExplicitServer, server: &store.ServerSnapshot{Name: "foo", Shared: true}, expected: false}, - {name: "SharedModelAnyServer", model: modelSharedServer, server: &store.ServerSnapshot{Name: "foo", Shared: true}, expected: true}, - {name: "SharedModelNotSharedServer", model: modelSharedServer, server: &store.ServerSnapshot{Name: "foo", Shared: false}, expected: false}, + {name: "ModelAndServerMatchNotShared", model: modelExplicitServer, server: &db.Server{Name: serverName, Shared: false}, expected: true}, + {name: "ModelAndServerMatchShared", model: modelExplicitServer, server: &db.Server{Name: serverName, Shared: true}, expected: true}, + {name: "ModelAndServerDontMatch", model: modelExplicitServer, server: &db.Server{Name: "foo", Shared: true}, expected: false}, + {name: "SharedModelAnyServer", model: modelSharedServer, server: &db.Server{Name: "foo", Shared: true}, expected: true}, + {name: "SharedModelNotSharedServer", model: modelSharedServer, server: &db.Server{Name: "foo", Shared: false}, expected: false}, } for _, test := range tests { From ba96b71dbf3b6d5d8e51e423c7481fce3d279187 Mon Sep 17 00:00:00 2001 From: Miguel Angel Date: Wed, 14 Jan 2026 14:26:25 +0100 Subject: [PATCH 43/50] fix scheduler test * fix server type * wip fix scheduler test * wip more tests * fix first set of tests * test failed models --- scheduler/pkg/scheduler/scheduler_test.go | 1549 +++++++++++++++++---- scheduler/pkg/store/mesh.go | 4 +- 2 files changed, 1279 insertions(+), 274 deletions(-) diff --git a/scheduler/pkg/scheduler/scheduler_test.go b/scheduler/pkg/scheduler/scheduler_test.go index 4f789dcd92..03cf3289c4 100644 --- a/scheduler/pkg/scheduler/scheduler_test.go +++ b/scheduler/pkg/scheduler/scheduler_test.go @@ -20,13 +20,16 @@ import ( "github.com/gotidy/ptr" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" log "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/mock/gomock" + ptr2 "k8s.io/utils/ptr" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + pbs "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" @@ -35,117 +38,6 @@ import ( mock2 "github.com/seldonio/seldon-core/scheduler/v2/pkg/synchroniser/mock" ) -type mockStore struct { - models map[string]*store.ModelSnapshot - servers []*store.ServerSnapshot - scheduledServer string - scheduledReplicas []int - unloadedModels map[string]uint32 -} - -var _ store.ModelServerAPI = (*mockStore)(nil) - -func (f mockStore) FailedScheduling(modelID string, version uint32, reason string, reset bool) error { - return nil -} - -func (f mockStore) UnloadVersionModels(modelKey string, version uint32) (bool, error) { - if f.unloadedModels != nil { - f.unloadedModels[modelKey] = version - } - return true, nil -} - -func (f mockStore) UnloadModelGwVersionModels(modelKey string, version uint32) (bool, error) { - return true, nil -} - -func (f mockStore) ServerNotify(request *pb.ServerNotify) error { - return nil -} - -func (f mockStore) RemoveModel(req *pb.UnloadModelRequest) error { - return nil -} - -func (f mockStore) UpdateModel(config *pb.LoadModelRequest) error { - return nil -} - -func (f mockStore) GetModel(key string) (*store.ModelSnapshot, error) { - return f.models[key], nil -} - -func (f mockStore) GetModels() ([]*store.ModelSnapshot, error) { - models := []*store.ModelSnapshot{} - for _, m := range f.models { - models = append(models, m) - } - return models, nil -} - -func (f mockStore) LockModel(modelId string) { -} - -func (f mockStore) UnlockModel(modelId string) { -} - -func (f mockStore) ExistsModelVersion(key string, version uint32) bool { - return false -} - -func (f mockStore) GetServers(shallow bool, modelDetails bool) ([]*store.ServerSnapshot, error) { - return f.servers, nil -} - -func (f mockStore) GetServer(serverKey string, shallow bool, modelDetails bool) (*store.ServerSnapshot, error) { - panic("implement me") -} - -func (m *mockStore) GetAllModels() []string { - var modelNames []string - for modelName := range m.models { - modelNames = append(modelNames, modelName) - } - return modelNames -} - -func (f *mockStore) UpdateLoadedModels(modelKey string, version uint32, serverKey string, replicas []*store.ServerReplica) error { - f.scheduledServer = serverKey - var replicaIdxs []int - for _, rep := range replicas { - replicaIdxs = append(replicaIdxs, rep.GetReplicaIdx()) - } - f.scheduledReplicas = replicaIdxs - return nil -} - -func (f mockStore) UpdateModelState(modelKey string, version uint32, serverKey string, replicaIdx int, availableMemory *uint64, expectedState, desiredState store.ModelReplicaState, reason string, runtimeInfo *pb.ModelRuntimeInfo) error { - panic("implement me") -} - -func (f mockStore) AddServerReplica(request *agent.AgentSubscribeRequest) error { - panic("implement me") -} - -func (f mockStore) RemoveServerReplica(serverName string, replicaIdx int) ([]string, error) { - panic("implement me") -} - -func (f mockStore) DrainServerReplica(serverName string, replicaIdx int) ([]string, error) { - panic("implement me") -} - -func (f mockStore) AddModelEventListener(c chan *store.ModelSnapshot) { -} - -func (f mockStore) AddServerEventListener(c chan string) { -} - -func (f mockStore) SetModelGwModelState(name string, versionNumber uint32, status store.ModelState, reason string, source string) error { - panic("implement me") -} - func TestScheduler(t *testing.T) { logger := log.New() g := NewGomegaWithT(t) @@ -177,32 +69,67 @@ func TestScheduler(t *testing.T) { type test struct { name string model *store.ModelSnapshot + modelName string servers []*store.ServerSnapshot scheduled bool - scheduledServer string - scheduledReplicas []int + checkServerEvents bool expectedServerEvents int + setupMock func(m *mock.MockModelServerAPI) } tests := []test{ { - name: "SmokeTest", - model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{}, false, "", nil), - servers: []*store.ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*store.ServerReplica{0: gsr(0, 200, []string{"sklearn"}, "server1", true, false)}, // expect schedule here - Shared: true, - ExpectedReplicas: -1, - }, + name: "SmokeTest", + modelName: "model1", + scheduled: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(200)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 1}, + }, + 1, "server1", + nil, db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", false), []string{"sklearn"}, 0, 200, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: 1, + MinReplicas: 1, + MaxReplicas: 1, + KubernetesMeta: nil, + }, + } + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server1", slices.Collect(maps.Values(servers[0].Replicas))).Return(nil) }, - scheduled: true, - scheduledServer: "server1", - scheduledReplicas: []int{0}, }, { - name: "ReplicasTwo", - model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 2, []int{}, false, "", nil), + name: "ReplicasTwo", + model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 2, []int{}, false, "", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server1", @@ -220,13 +147,67 @@ func TestScheduler(t *testing.T) { ExpectedReplicas: -1, }, }, - scheduled: true, - scheduledServer: "server2", - scheduledReplicas: []int{0, 1}, + scheduled: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(200)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 1}, + }, + 1, "server1", + nil, db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", false), []string{"sklearn"}, 0, 200, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", false), []string{"sklearn"}, 0, 200, 0, nil, 100), + 2: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", false), []string{"sklearn"}, 0, 200, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", false), []string{"sklearn"}, 0, 200, 0, nil, 100), + } + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server1", expectedUpdatedServers).Return(nil) + }, }, { - name: "NotEnoughReplicas", - model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 2, []int{}, false, "", nil), + name: "NotEnoughReplicas", + model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 2, []int{}, false, "", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server1", @@ -245,10 +226,61 @@ func TestScheduler(t *testing.T) { }, }, scheduled: false, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(100)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 0, MaxReplicas: 2}, + }, + 1, "server1", + nil, db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", false), []string{"sklearn"}, 0, 200, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server2", false), []string{"sklearn"}, 0, 200, 0, nil, 100), + 2: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", false), []string{"sklearn"}, 0, 0, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().FailedScheduling("model1", gomock.Any(), gomock.Any(), gomock.Any()) + }, }, { - name: "NotEnoughReplicas - schedule min replicas", - model: newTestModel("model1", 100, []string{"sklearn"}, 3, 2, 3, []int{}, false, "server2", nil), + name: "NotEnoughReplicas - schedule min replicas", + model: newTestModel("model1", 100, []string{"sklearn"}, 3, 2, 3, []int{}, false, "server2", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server1", @@ -266,14 +298,68 @@ func TestScheduler(t *testing.T) { ExpectedReplicas: -1, }, }, - scheduled: true, // not here that we still trying to mark the model as Available - scheduledServer: "server2", - scheduledReplicas: []int{0, 1}, + scheduled: true, // not here that we still trying to mark the model as Available + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(200)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 3, MinReplicas: 2, MaxReplicas: 3}, + }, + 1, "server2", + nil, db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", false), []string{"sklearn"}, 0, 200, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", false), []string{"sklearn"}, 0, 200, 0, nil, 100), + 2: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", false), []string{"sklearn"}, 0, 200, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", false), []string{"sklearn"}, 0, 200, 0, nil, 100), + util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", false), []string{"sklearn"}, 0, 200, 0, nil, 100), + } + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server2", expectedUpdatedServers).Return(nil) + }, expectedServerEvents: 1, }, { - name: "MemoryOneServer", - model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{}, false, "", nil), + name: "MemoryOneServer", + model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{}, false, "", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server1", @@ -290,13 +376,65 @@ func TestScheduler(t *testing.T) { ExpectedReplicas: -1, }, }, - scheduled: true, - scheduledServer: "server2", - scheduledReplicas: []int{0}, + scheduled: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(200)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 1}, + }, + 1, "server2", + nil, db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", false), []string{"sklearn"}, 0, 50, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", false), []string{"sklearn"}, 0, 200, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", false), []string{"sklearn"}, 0, 200, 0, nil, 100), + } + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server2", expectedUpdatedServers).Return(nil) + }, }, { - name: "ModelsLoaded", - model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 2, []int{1}, false, "", nil), + name: "ModelsLoaded", + model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 2, []int{1}, false, "", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server1", @@ -314,13 +452,65 @@ func TestScheduler(t *testing.T) { ExpectedReplicas: -1, }, }, - scheduled: true, - scheduledServer: "server2", - scheduledReplicas: []int{1, 0}, + scheduled: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(200)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 1}, + }, + 1, "server2", + nil, db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", true), []string{"sklearn"}, 0, 50, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + } + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server2", expectedUpdatedServers).Return(nil) + }, }, { - name: "ModelUnLoaded", - model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 2, []int{1}, true, "server2", nil), + name: "ModelUnLoaded", + model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 2, []int{1}, true, "server2", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server2", @@ -332,13 +522,64 @@ func TestScheduler(t *testing.T) { ExpectedReplicas: -1, }, }, - scheduled: true, - scheduledServer: "server2", - scheduledReplicas: nil, + scheduled: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(200)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 1}, + }, + 1, "server2", + map[int32]*db.ReplicaStatus{ + 1: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + }, + db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + } + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server2", expectedUpdatedServers).Return(nil) + }, }, { - name: "DeletedServer", - model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{}, false, "", nil), + name: "DeletedServer", + model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{}, false, "", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server1", @@ -356,13 +597,68 @@ func TestScheduler(t *testing.T) { ExpectedReplicas: -1, }, }, - scheduled: true, - scheduledServer: "server2", - scheduledReplicas: []int{0}, + scheduled: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(200)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 1}, + }, + 1, "server2", + nil, db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", true), []string{"sklearn"}, 0, 0, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + 2: util.NewTestServerReplica("host1", 8080, 5000, 2, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", false), []string{"sklearn"}, 0, 200, 0, nil, 100), + } + + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server2", expectedUpdatedServers).Return(nil) + }, }, { - name: "Reschedule", - model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{0}, false, "server1", nil), + name: "Reschedule", + model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{0}, false, "server1", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server1", @@ -380,13 +676,75 @@ func TestScheduler(t *testing.T) { ExpectedReplicas: -1, }, }, - scheduled: true, - scheduledServer: "server2", - scheduledReplicas: []int{0}, + scheduled: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(200)), + Server: ptr.String("server1"), + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 1}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 0: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + }, + db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: 0, + KubernetesMeta: nil, + }, + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + } + + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server1", expectedUpdatedServers).Return(nil) + }, }, { - name: "DeletedServerFail", - model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{1}, false, "", nil), + name: "DeletedServerFail", + model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{1}, false, "", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server1", @@ -396,10 +754,58 @@ func TestScheduler(t *testing.T) { }, }, scheduled: false, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(100)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 1}, + }, + 1, "", + map[int32]*db.ReplicaStatus{ + 0: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + }, + db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server1", false), []string{"sklearn"}, 0, 50, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().FailedScheduling("model1", gomock.Any(), gomock.Any(), gomock.Any()) + }, }, { - name: "Available memory sorting", - model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{1}, false, "", nil), + name: "Available memory sorting", + model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{1}, false, "", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server2", @@ -411,13 +817,59 @@ func TestScheduler(t *testing.T) { ExpectedReplicas: -1, }, }, - scheduled: true, - scheduledServer: "server2", - scheduledReplicas: []int{1}, + scheduled: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(100)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 1}, + }, + 1, "server2", + nil, db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server2", true), []string{"sklearn"}, 0, 150, 0, nil, 100), + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + } + + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server2", expectedUpdatedServers).Return(nil) + }, }, { - name: "Available memory sorting with multiple replicas", - model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 1, []int{1}, false, "", nil), + name: "Available memory sorting with multiple replicas", + model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 1, []int{1}, false, "", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server2", @@ -430,13 +882,61 @@ func TestScheduler(t *testing.T) { ExpectedReplicas: -1, }, }, - scheduled: true, - scheduledServer: "server2", - scheduledReplicas: []int{1, 2}, + scheduled: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(100)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 0, MaxReplicas: 1}, + }, + 1, "server2", + nil, db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server2", true), []string{"sklearn"}, 0, 150, 0, nil, 100), + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + 2: util.NewTestServerReplica("host1", 8080, 5000, 2, store.NewServer("server2", true), []string{"sklearn"}, 0, 175, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + util.NewTestServerReplica("host1", 8080, 5000, 2, store.NewServer("server2", true), []string{"sklearn"}, 0, 175, 0, nil, 100), + } + + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server2", expectedUpdatedServers).Return(nil) + }, }, { - name: "Scale up", - model: newTestModel("model1", 100, []string{"sklearn"}, 3, 0, 3, []int{1, 2}, false, "server1", nil), + name: "Scale up", + model: newTestModel("model1", 100, []string{"sklearn"}, 3, 0, 3, []int{1, 2}, false, "server1", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server1", @@ -450,13 +950,74 @@ func TestScheduler(t *testing.T) { ExpectedReplicas: -1, }, }, - scheduled: true, - scheduledServer: "server1", - scheduledReplicas: []int{1, 2, 3}, + scheduled: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(100)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 3, MinReplicas: 0, MaxReplicas: 3}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + + 1: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + 2: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + }, + db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server1", true), []string{"sklearn"}, 0, 50, 0, nil, 100), + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + 2: util.NewTestServerReplica("host1", 8080, 5000, 2, store.NewServer("server1", true), []string{"sklearn"}, 0, 175, 0, nil, 100), + 3: util.NewTestServerReplica("host1", 8080, 5000, 3, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + util.NewTestServerReplica("host1", 8080, 5000, 2, store.NewServer("server1", true), []string{"sklearn"}, 0, 175, 0, nil, 100), + util.NewTestServerReplica("host1", 8080, 5000, 3, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + } + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server1", expectedUpdatedServers).Return(nil) + }, }, { - name: "Scale down", - model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{1, 2}, false, "server1", nil), + name: "Scale down", + model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{1, 2}, false, "server1", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server1", @@ -470,13 +1031,72 @@ func TestScheduler(t *testing.T) { ExpectedReplicas: -1, }, }, - scheduled: true, - scheduledServer: "server1", - scheduledReplicas: []int{1}, + scheduled: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(100)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 1}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + + 1: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + 2: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + }, + db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server1", true), []string{"sklearn"}, 0, 50, 0, nil, 100), + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + 2: util.NewTestServerReplica("host1", 8080, 5000, 2, store.NewServer("server1", true), []string{"sklearn"}, 0, 175, 0, nil, 100), + 3: util.NewTestServerReplica("host1", 8080, 5000, 3, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + } + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server1", expectedUpdatedServers).Return(nil) + }, }, { - name: "Scale up - not enough replicas use max of the server", - model: newTestModel("model1", 100, []string{"sklearn"}, 5, 3, 5, []int{1, 2}, false, "server1", nil), + name: "Scale up - not enough replicas use max of the server", + model: newTestModel("model1", 100, []string{"sklearn"}, 5, 3, 5, []int{1, 2}, false, "server1", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server1", @@ -491,13 +1111,76 @@ func TestScheduler(t *testing.T) { }, }, scheduled: true, // note that we are still trying to make the model as Available - scheduledServer: "server1", - scheduledReplicas: []int{0, 1, 2, 3}, // used all replicas expectedServerEvents: 1, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(100)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 5, MinReplicas: 3, MaxReplicas: 5}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + + 1: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + 2: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + }, + db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + 2: util.NewTestServerReplica("host1", 8080, 5000, 2, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + 3: util.NewTestServerReplica("host1", 8080, 5000, 3, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + util.NewTestServerReplica("host1", 8080, 5000, 2, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + util.NewTestServerReplica("host1", 8080, 5000, 3, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + } + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server1", expectedUpdatedServers).Return(nil) + }, }, { - name: "Scale up - no capacity on loaded replica servers, should still go there", - model: newTestModel("model1", 100, []string{"sklearn"}, 3, 0, 3, []int{1, 2}, false, "server1", nil), + name: "Scale up - no capacity on loaded replica servers, should still go there", + model: newTestModel("model1", 100, []string{"sklearn"}, 3, 0, 3, []int{1, 2}, false, "server1", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server1", @@ -511,13 +1194,75 @@ func TestScheduler(t *testing.T) { ExpectedReplicas: -1, }, }, - scheduled: true, - scheduledServer: "server1", - scheduledReplicas: []int{1, 2, 3}, + scheduled: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(100)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 3, MinReplicas: 0, MaxReplicas: 3}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + + 1: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + 2: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + }, + db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server1", true), []string{"sklearn"}, 0, 50, 0, nil, 100), + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", true), []string{"sklearn"}, 0, 0, 0, nil, 100), + 2: util.NewTestServerReplica("host1", 8080, 5000, 2, store.NewServer("server1", true), []string{"sklearn"}, 0, 0, 0, nil, 100), + 3: util.NewTestServerReplica("host1", 8080, 5000, 3, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", true), []string{"sklearn"}, 0, 0, 0, nil, 100), + util.NewTestServerReplica("host1", 8080, 5000, 2, store.NewServer("server1", true), []string{"sklearn"}, 0, 0, 0, nil, 100), + util.NewTestServerReplica("host1", 8080, 5000, 3, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + } + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server1", expectedUpdatedServers).Return(nil) + }, }, { - name: "Scale down - no capacity on loaded replica servers, should still go there", - model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{1, 2}, false, "server1", nil), + name: "Scale down - no capacity on loaded replica servers, should still go there", + model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{1, 2}, false, "server1", nil), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server1", @@ -531,13 +1276,75 @@ func TestScheduler(t *testing.T) { ExpectedReplicas: -1, }, }, - scheduled: true, - scheduledServer: "server1", - scheduledReplicas: []int{1}, + scheduled: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(100)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 3, MinReplicas: 0, MaxReplicas: 3}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + + 1: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + 2: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + }, + db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server1", true), []string{"sklearn"}, 0, 50, 0, nil, 100), + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", true), []string{"sklearn"}, 0, 0, 0, nil, 100), + 2: util.NewTestServerReplica("host1", 8080, 5000, 2, store.NewServer("server1", true), []string{"sklearn"}, 0, 0, 0, nil, 100), + 3: util.NewTestServerReplica("host1", 8080, 5000, 3, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", true), []string{"sklearn"}, 0, 0, 0, nil, 100), + util.NewTestServerReplica("host1", 8080, 5000, 2, store.NewServer("server1", true), []string{"sklearn"}, 0, 0, 0, nil, 100), + util.NewTestServerReplica("host1", 8080, 5000, 3, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + } + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server1", expectedUpdatedServers).Return(nil) + }, }, { - name: "Drain", - model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 2, []int{1}, false, "server1", []int{2}), + name: "Drain", + model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 2, []int{1}, false, "server1", []int{2}), + modelName: "model1", servers: []*store.ServerSnapshot{ { Name: "server1", @@ -551,25 +1358,81 @@ func TestScheduler(t *testing.T) { ExpectedReplicas: -1, }, }, - scheduled: true, - scheduledServer: "server1", - scheduledReplicas: []int{1, 3}, - }, - } + scheduled: true, + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(100)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 3, MinReplicas: 0, MaxReplicas: 3}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + + 1: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + 2: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + }, + db.ModelState_ModelProgressing), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server1", true), []string{"sklearn"}, 0, 50, 0, nil, 100), + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + 2: util.NewTestServerReplica("host1", 8080, 5000, 2, store.NewServer("server1", true), []string{"sklearn"}, 0, 175, 0, nil, 100), + 3: util.NewTestServerReplica("host1", 8080, 5000, 3, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } - newMockStore := func(model *store.ModelSnapshot, servers []*store.ServerSnapshot) *mockStore { - modelMap := make(map[string]*store.ModelSnapshot) - modelMap[model.Name] = model - return &mockStore{ - models: modelMap, - servers: servers, - } + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server1", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + util.NewTestServerReplica("host1", 8080, 5000, 2, store.NewServer("server1", true), []string{"sklearn"}, 0, 175, 0, nil, 100), + util.NewTestServerReplica("host1", 8080, 5000, 3, store.NewServer("server1", true), []string{"sklearn"}, 0, 100, 0, nil, 100), + } + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server1", expectedUpdatedServers).Return(nil) + }, + }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { eventHub, _ := coordinator.NewEventHub(logger) + ctrl := gomock.NewController(t) + mockModelServerAPI := mock.NewMockModelServerAPI(ctrl) + test.setupMock(mockModelServerAPI) + serverEvents := int64(0) eventHub.RegisterServerEventHandler( "handler-server", @@ -578,9 +1441,8 @@ func TestScheduler(t *testing.T) { func(event coordinator.ServerEventMsg) { atomic.AddInt64(&serverEvents, 1) }, ) - mockStore := newMockStore(test.model, test.servers) - scheduler := NewSimpleScheduler(logger, mockStore, DefaultSchedulerConfig(mockStore), synchroniser.NewSimpleSynchroniser(time.Duration(10*time.Millisecond)), eventHub) - err := scheduler.Schedule(test.model.Name) + scheduler := NewSimpleScheduler(logger, mockModelServerAPI, DefaultSchedulerConfig(mockModelServerAPI), synchroniser.NewSimpleSynchroniser(time.Duration(10*time.Millisecond)), eventHub) + err := scheduler.Schedule(test.modelName) if test.scheduled { g.Expect(err).To(BeNil()) } else { @@ -589,11 +1451,8 @@ func TestScheduler(t *testing.T) { if test.expectedServerEvents > 0 { // wait for event time.Sleep(500 * time.Millisecond) } - if test.scheduledServer != "" { - g.Expect(test.scheduledServer).To(Equal(mockStore.scheduledServer)) - sort.Ints(test.scheduledReplicas) - sort.Ints(mockStore.scheduledReplicas) - g.Expect(test.scheduledReplicas).To(Equal(mockStore.scheduledReplicas)) + + if test.checkServerEvents { g.Expect(atomic.LoadInt64(&serverEvents)).To(Equal(int64(test.expectedServerEvents))) } }) @@ -610,46 +1469,183 @@ func TestFailedModels(t *testing.T) { availableReplicas uint32 } - newMockStore := func(models map[string]modelStateWithMetadata) *mockStore { - snapshots := map[string]*store.ModelSnapshot{} - for name, state := range models { - mv := store.NewModelVersion(&pb.Model{DeploymentSpec: state.deploymentSpec}, 1, "", map[int]store.ReplicaStatus{}, false, state.state) - mv.SetModelState(store.ModelStatus{ - State: state.state, - AvailableReplicas: state.availableReplicas, - }) - snapshot := &store.ModelSnapshot{ - Name: name, - Versions: []*store.ModelVersion{mv}, - } - snapshots[name] = snapshot - } - return &mockStore{ - models: snapshots, - } - } - type test struct { name string - models map[string]modelStateWithMetadata + models []*db.Model + setupMock func(m *mock.MockModelServerAPI) expectedFailedModels []string } tests := []test{ { name: "SmokeTest", - models: map[string]modelStateWithMetadata{ - "model1": {store.ScheduleFailed, &pb.DeploymentSpec{Replicas: 1}, 0}, - "model2": {store.ModelFailed, &pb.DeploymentSpec{Replicas: 1}, 0}, - "model3": {store.ModelAvailable, &pb.DeploymentSpec{Replicas: 1}, 1}, - "model4": {store.ModelAvailable, &pb.DeploymentSpec{Replicas: 2, MinReplicas: 1, MaxReplicas: 2}, 1}, // retry models that have not reached desired replicas + setupMock: func(m *mock.MockModelServerAPI) { + + model3 := util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model3"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(100)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 1}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + }, + db.ModelState_ModelAvailable) + + // set available replicas + model3.State.AvailableReplicas = 1 + + model4 := util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model4"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(100)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 2, MinReplicas: 1, MaxReplicas: 2}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + }, + db.ModelState_ModelAvailable) + + // set available replicas + model4.State.AvailableReplicas = 1 + + m.EXPECT().GetModels().Return( + []*db.Model{ + { + Name: "model1", + Versions: []*db.ModelVersion{util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(100)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 1}, + }, + 1, "server1", + nil, + db.ModelState_ScheduleFailed), + }, + }, + { + Name: "model2", + Versions: []*db.ModelVersion{util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model2"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(100)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 1}, + }, + 1, "server1", + nil, + db.ModelState_ModelFailed), + }, + }, + { + Name: "model3", + Versions: []*db.ModelVersion{ + model3, + }, + }, + { + Name: "model4", + Versions: []*db.ModelVersion{ + model4, + }, + }}, nil) + }, expectedFailedModels: []string{"model1", "model2", "model4"}, }, { name: "SmokeTest", - models: map[string]modelStateWithMetadata{ - "model3": {store.ModelAvailable, &pb.DeploymentSpec{Replicas: 1}, 1}, + setupMock: func(m *mock.MockModelServerAPI) { + + model3 := util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model3"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: ptr2.To(uint64(100)), + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 1}, + }, + 1, "server1", + map[int32]*db.ReplicaStatus{ + 1: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + }, + db.ModelState_ModelAvailable) + + // set available replicas + model3.State.AvailableReplicas = 1 + + m.EXPECT().GetModels().Return( + []*db.Model{ + + { + Name: "model3", + Versions: []*db.ModelVersion{ + model3, + }, + }}, nil) + }, expectedFailedModels: nil, }, @@ -659,8 +1655,12 @@ func TestFailedModels(t *testing.T) { t.Run(test.name, func(t *testing.T) { eventHub, _ := coordinator.NewEventHub(logger) - mockStore := newMockStore(test.models) - scheduler := NewSimpleScheduler(logger, mockStore, DefaultSchedulerConfig(mockStore), synchroniser.NewSimpleSynchroniser(time.Duration(10*time.Millisecond)), eventHub) + ctrl := gomock.NewController(t) + mockModelServerAPI := mock.NewMockModelServerAPI(ctrl) + test.setupMock(mockModelServerAPI) + + //mockStore := newMockStore(test.models) + scheduler := NewSimpleScheduler(logger, mockModelServerAPI, DefaultSchedulerConfig(mockModelServerAPI), synchroniser.NewSimpleSynchroniser(10*time.Millisecond), eventHub) failedModels, err := scheduler.getFailedModels() g.Expect(err).To(BeNil()) sort.Strings(failedModels) @@ -700,15 +1700,15 @@ func TestRemoveAllVersions(t *testing.T) { return replica } - newMockStore := func(model *store.ModelSnapshot, servers []*store.ServerSnapshot) *mockStore { - modelMap := make(map[string]*store.ModelSnapshot) - modelMap[model.Name] = model - return &mockStore{ - models: modelMap, - servers: servers, - unloadedModels: make(map[string]uint32), - } - } + //newMockStore := func(model *store.ModelSnapshot, servers []*store.ServerSnapshot) *mockStore { + // modelMap := make(map[string]*store.ModelSnapshot) + // modelMap[model.Name] = model + // return &mockStore{ + // models: modelMap, + // servers: servers, + // unloadedModels: make(map[string]uint32), + // } + //} type test struct { name string @@ -755,12 +1755,17 @@ func TestRemoveAllVersions(t *testing.T) { for _, test := range tests { t.Run(test.name, func(t *testing.T) { _, _ = coordinator.NewEventHub(logger) - mockStore := newMockStore(test.model, test.servers) - scheduler := NewSimpleScheduler(logger, mockStore, DefaultSchedulerConfig(mockStore), synchroniser.NewSimpleSynchroniser(time.Duration(10*time.Millisecond)), nil) + + ctrl := gomock.NewController(t) + mockModelServerAPI := mock.NewMockModelServerAPI(ctrl) + //test.setupMock(mockModelServerAPI) + + //mockStore := newMockStore(test.model, test.servers) + scheduler := NewSimpleScheduler(logger, mockModelServerAPI, DefaultSchedulerConfig(mockModelServerAPI), synchroniser.NewSimpleSynchroniser(time.Duration(10*time.Millisecond)), nil) err := scheduler.Schedule(test.model.Name) g.Expect(err).To(BeNil()) - g.Expect(mockStore.unloadedModels[test.model.Name]).To(Equal(uint32(test.numVersions))) + //g.Expect(mockStore.unloadedModels[test.model.Name]).To(Equal(uint32(test.numVersions))) }) } } @@ -770,14 +1775,14 @@ func TestScheduleFailedModels(t *testing.T) { tests := []struct { name string - setupMocks func(*mock.MockModelStore, *mock2.MockSynchroniser) + setupMocks func(*mock.MockModelServerAPI, *mock2.MockSynchroniser) expectedModels []string expectError bool errorContains string }{ { name: "success - schedules single failed model", - setupMocks: func(ms *mock.MockModelStore, sync *mock2.MockSynchroniser) { + setupMocks: func(ms *mock.MockModelServerAPI, sync *mock2.MockSynchroniser) { sync.EXPECT().IsReady().Return(true) model1 := &store.ModelSnapshot{ @@ -820,7 +1825,7 @@ func TestScheduleFailedModels(t *testing.T) { servers := []*store.ServerSnapshot{ createServerSnapshot("server1", 1, 16000), } - ms.EXPECT().GetServers(false, true).Return(servers, nil) + ms.EXPECT().GetServers().Return(servers, nil) ms.EXPECT().UpdateLoadedModels("model1", uint32(1), "server1", slices.Collect(maps.Values(servers[0].Replicas))).Return(nil) }, @@ -829,7 +1834,7 @@ func TestScheduleFailedModels(t *testing.T) { }, { name: "success - schedules 2 failed models", - setupMocks: func(ms *mock.MockModelStore, sync *mock2.MockSynchroniser) { + setupMocks: func(ms *mock.MockModelServerAPI, sync *mock2.MockSynchroniser) { sync.EXPECT().IsReady().Return(true) model1 := &store.ModelSnapshot{ @@ -904,7 +1909,7 @@ func TestScheduleFailedModels(t *testing.T) { servers := []*store.ServerSnapshot{ createServerSnapshot("server1", 1, 16000), } - ms.EXPECT().GetServers(false, true).Return(servers, nil) + ms.EXPECT().GetServers().Return(servers, nil) ms.EXPECT().UpdateLoadedModels("model1", uint32(1), "server1", slices.Collect(maps.Values(servers[0].Replicas))).Return(nil) @@ -914,7 +1919,7 @@ func TestScheduleFailedModels(t *testing.T) { ms.EXPECT().UnlockModel("model2") ms.EXPECT().GetModel("model2").Return(model2, nil) - ms.EXPECT().GetServers(false, true).Return(servers, nil) + ms.EXPECT().GetServers().Return(servers, nil) ms.EXPECT().UpdateLoadedModels("model2", uint32(1), "server1", slices.Collect(maps.Values(servers[0].Replicas))).Return(nil) }, @@ -923,7 +1928,7 @@ func TestScheduleFailedModels(t *testing.T) { }, { name: "failure - unable to schedule model on desired replicas or min replicas", - setupMocks: func(ms *mock.MockModelStore, sync *mock2.MockSynchroniser) { + setupMocks: func(ms *mock.MockModelServerAPI, sync *mock2.MockSynchroniser) { sync.EXPECT().IsReady().Return(true) model1 := &store.ModelSnapshot{ @@ -966,7 +1971,7 @@ func TestScheduleFailedModels(t *testing.T) { servers := []*store.ServerSnapshot{ createServerSnapshot("server1", 1, 16000), } - ms.EXPECT().GetServers(false, true).Return(servers, nil) + ms.EXPECT().GetServers().Return(servers, nil) ms.EXPECT().FailedScheduling("model1", uint32(1), "Failed to schedule model as no matching server had enough suitable replicas", true).Return(nil) }, @@ -974,7 +1979,7 @@ func TestScheduleFailedModels(t *testing.T) { }, { name: "failure - failed getting models", - setupMocks: func(ms *mock.MockModelStore, sync *mock2.MockSynchroniser) { + setupMocks: func(ms *mock.MockModelServerAPI, sync *mock2.MockSynchroniser) { sync.EXPECT().IsReady().Return(true) ms.EXPECT().GetModels().Return(nil, errors.New("some error")) }, @@ -988,7 +1993,7 @@ func TestScheduleFailedModels(t *testing.T) { t.Parallel() ctrl := gomock.NewController(t) - mockStore := mock.NewMockModelStore(ctrl) + mockStore := mock.NewMockModelServerAPI(ctrl) mockSync := mock2.NewMockSynchroniser(ctrl) tt.setupMocks(mockStore, mockSync) diff --git a/scheduler/pkg/store/mesh.go b/scheduler/pkg/store/mesh.go index 00ea70b97b..fbf9dcb87f 100644 --- a/scheduler/pkg/store/mesh.go +++ b/scheduler/pkg/store/mesh.go @@ -191,7 +191,7 @@ func NewServerReplica(inferenceSvc string, inferenceHttpPort int32, inferenceGrpcPort int32, replicaIdx int, - server *Server, + server *db.Server, capabilities []string, memory, availableMemory, @@ -203,7 +203,7 @@ func NewServerReplica(inferenceSvc string, inferenceSvc: inferenceSvc, inferenceHttpPort: inferenceHttpPort, inferenceGrpcPort: inferenceGrpcPort, - serverName: server.name, + serverName: server.Name, replicaIdx: replicaIdx, capabilities: cleanCapabilities(capabilities), memory: memory, From de82c91c81d216f33828569447701298b34619b6 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Wed, 14 Jan 2026 13:27:27 +0000 Subject: [PATCH 44/50] fix sorter tests --- .../pkg/scheduler/sorters/loaded_test.go | 62 +++++++++---------- .../scheduler/sorters/replicaindex_test.go | 21 ++++--- .../scheduler/sorters/replicamemory_test.go | 28 ++++----- 3 files changed, 55 insertions(+), 56 deletions(-) diff --git a/scheduler/pkg/scheduler/sorters/loaded_test.go b/scheduler/pkg/scheduler/sorters/loaded_test.go index 153194aba0..829922eabc 100644 --- a/scheduler/pkg/scheduler/sorters/loaded_test.go +++ b/scheduler/pkg/scheduler/sorters/loaded_test.go @@ -14,8 +14,9 @@ import ( "testing" . "github.com/onsi/gomega" - + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) func TestModelAlreadyLoadedSort(t *testing.T) { @@ -24,43 +25,42 @@ func TestModelAlreadyLoadedSort(t *testing.T) { type test struct { name string replicas []*CandidateReplica - ordering []int + ordering []int32 } - model := store.NewModelVersion( + model := util.NewTestModelVersion( nil, 1, "server1", - map[int]store.ReplicaStatus{3: {State: store.Loading}}, - false, - store.ModelProgressing) - modelServer2 := store.NewModelVersion( + map[int32]*db.ReplicaStatus{3: {State: db.ModelReplicaState_Loading}}, + db.ModelState_ModelProgressing) + + modelServer2 := util.NewTestModelVersion( nil, 1, "server2", - map[int]store.ReplicaStatus{3: {State: store.Loading}}, - false, - store.ModelProgressing) + map[int32]*db.ReplicaStatus{3: {State: db.ModelReplicaState_Loading}}, + db.ModelState_ModelProgressing) server := store.NewServer("server1", true) tests := []test{ { name: "OneLoadedModel", replicas: []*CandidateReplica{ - {Model: model, Server: &store.ServerSnapshot{Name: "server1"}, Replica: store.NewServerReplica("", 8080, 5001, 2, server, []string{}, 100, 100, 0, map[store.ModelVersionID]bool{}, 100)}, - {Model: model, Server: &store.ServerSnapshot{Name: "server1"}, Replica: store.NewServerReplica("", 8080, 5001, 1, server, []string{}, 100, 100, 0, map[store.ModelVersionID]bool{}, 100)}, - {Model: model, Server: &store.ServerSnapshot{Name: "server1"}, Replica: store.NewServerReplica("", 8080, 5001, 3, server, []string{}, 100, 100, 0, map[store.ModelVersionID]bool{}, 100)}, + {Model: model, Server: &db.Server{Name: "server1"}, Replica: util.NewTestServerReplica("", 8080, 5001, 2, server, []string{}, 100, 100, 0, []*db.ModelVersionID{}, 100)}, + {Model: model, Server: &db.Server{Name: "server1"}, Replica: util.NewTestServerReplica("", 8080, 5001, 1, server, []string{}, 100, 100, 0, []*db.ModelVersionID{}, 100)}, + {Model: model, Server: &db.Server{Name: "server1"}, Replica: util.NewTestServerReplica("", 8080, 5001, 3, server, []string{}, 100, 100, 0, []*db.ModelVersionID{}, 100)}, }, - ordering: []int{3, 2, 1}, + ordering: []int32{3, 2, 1}, }, { name: "LoadedDifferentServer", replicas: []*CandidateReplica{ - {Model: modelServer2, Server: &store.ServerSnapshot{Name: "server1"}, Replica: store.NewServerReplica("", 8080, 5001, 2, server, []string{}, 100, 100, 0, map[store.ModelVersionID]bool{}, 100)}, - {Model: modelServer2, Server: &store.ServerSnapshot{Name: "server1"}, Replica: store.NewServerReplica("", 8080, 5001, 1, server, []string{}, 100, 100, 0, map[store.ModelVersionID]bool{}, 100)}, - {Model: modelServer2, Server: &store.ServerSnapshot{Name: "server1"}, Replica: store.NewServerReplica("", 8080, 5001, 3, server, []string{}, 100, 100, 0, map[store.ModelVersionID]bool{}, 100)}, + {Model: modelServer2, Server: &db.Server{Name: "server1"}, Replica: util.NewTestServerReplica("", 8080, 5001, 2, server, []string{}, 100, 100, 0, []*db.ModelVersionID{}, 100)}, + {Model: modelServer2, Server: &db.Server{Name: "server1"}, Replica: util.NewTestServerReplica("", 8080, 5001, 1, server, []string{}, 100, 100, 0, []*db.ModelVersionID{}, 100)}, + {Model: modelServer2, Server: &db.Server{Name: "server1"}, Replica: util.NewTestServerReplica("", 8080, 5001, 3, server, []string{}, 100, 100, 0, []*db.ModelVersionID{}, 100)}, }, - ordering: []int{2, 1, 3}, + ordering: []int32{2, 1, 3}, }, } @@ -84,38 +84,36 @@ func TestModelAlreadyLoadedOnServerSort(t *testing.T) { ordering []string } - modelServer1 := store.NewModelVersion( + modelServer1 := util.NewTestModelVersion( nil, 1, "server1", - map[int]store.ReplicaStatus{}, - false, - store.ModelAvailable) + map[int32]*db.ReplicaStatus{}, + db.ModelState_ModelAvailable) - modelNoServer := store.NewModelVersion( + modelNoServer := util.NewTestModelVersion( nil, 1, "", - map[int]store.ReplicaStatus{}, - false, - store.ModelStateUnknown) + map[int32]*db.ReplicaStatus{}, + db.ModelState_ModelStateUnknown) tests := []test{ { name: "LoadedOnOneServer", servers: []*CandidateServer{ - {Model: modelServer1, Server: &store.ServerSnapshot{Name: "server3"}}, - {Model: modelServer1, Server: &store.ServerSnapshot{Name: "server2"}}, - {Model: modelServer1, Server: &store.ServerSnapshot{Name: "server1"}}, + {Model: modelServer1, Server: &db.Server{Name: "server3"}}, + {Model: modelServer1, Server: &db.Server{Name: "server2"}}, + {Model: modelServer1, Server: &db.Server{Name: "server1"}}, }, ordering: []string{"server1", "server3", "server2"}, }, { name: "Not", servers: []*CandidateServer{ - {Model: modelNoServer, Server: &store.ServerSnapshot{Name: "server3"}}, - {Model: modelNoServer, Server: &store.ServerSnapshot{Name: "server2"}}, - {Model: modelNoServer, Server: &store.ServerSnapshot{Name: "server1"}}, + {Model: modelNoServer, Server: &db.Server{Name: "server3"}}, + {Model: modelNoServer, Server: &db.Server{Name: "server2"}}, + {Model: modelNoServer, Server: &db.Server{Name: "server1"}}, }, ordering: []string{"server3", "server2", "server1"}, }, diff --git a/scheduler/pkg/scheduler/sorters/replicaindex_test.go b/scheduler/pkg/scheduler/sorters/replicaindex_test.go index 444ba47757..7b4fc12384 100644 --- a/scheduler/pkg/scheduler/sorters/replicaindex_test.go +++ b/scheduler/pkg/scheduler/sorters/replicaindex_test.go @@ -14,8 +14,9 @@ import ( "testing" . "github.com/onsi/gomega" - + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) func TestReplicaIndexSorter(t *testing.T) { @@ -24,25 +25,25 @@ func TestReplicaIndexSorter(t *testing.T) { type test struct { name string replicas []*CandidateReplica - ordering []int + ordering []int32 } - model := store.NewModelVersion( + model := util.NewTestModelVersion( nil, 1, "server1", - map[int]store.ReplicaStatus{3: {State: store.Loading}}, - false, - store.ModelProgressing) + map[int32]*db.ReplicaStatus{3: {State: db.ModelReplicaState_Loading}}, + db.ModelState_ModelProgressing) + tests := []test{ { name: "OrderByIndex", replicas: []*CandidateReplica{ - {Model: model, Replica: store.NewServerReplica("", 8080, 5001, 20, store.NewServer("dummy", true), []string{}, 100, 200, 0, map[store.ModelVersionID]bool{}, 100)}, - {Model: model, Replica: store.NewServerReplica("", 8080, 5001, 10, store.NewServer("dummy", true), []string{}, 100, 100, 0, map[store.ModelVersionID]bool{}, 100)}, - {Model: model, Replica: store.NewServerReplica("", 8080, 5001, 30, store.NewServer("dummy", true), []string{}, 100, 150, 0, map[store.ModelVersionID]bool{}, 100)}, + {Model: model, Replica: util.NewTestServerReplica("", 8080, 5001, 20, store.NewServer("dummy", true), []string{}, 100, 200, 0, []*db.ModelVersionID{}, 100)}, + {Model: model, Replica: util.NewTestServerReplica("", 8080, 5001, 10, store.NewServer("dummy", true), []string{}, 100, 100, 0, []*db.ModelVersionID{}, 100)}, + {Model: model, Replica: util.NewTestServerReplica("", 8080, 5001, 30, store.NewServer("dummy", true), []string{}, 100, 150, 0, []*db.ModelVersionID{}, 100)}, }, - ordering: []int{10, 20, 30}, + ordering: []int32{10, 20, 30}, }, } diff --git a/scheduler/pkg/scheduler/sorters/replicamemory_test.go b/scheduler/pkg/scheduler/sorters/replicamemory_test.go index e50effc076..1388f7c2e3 100644 --- a/scheduler/pkg/scheduler/sorters/replicamemory_test.go +++ b/scheduler/pkg/scheduler/sorters/replicamemory_test.go @@ -14,8 +14,9 @@ import ( "testing" . "github.com/onsi/gomega" - + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) func TestReplicaMemorySort(t *testing.T) { @@ -24,34 +25,33 @@ func TestReplicaMemorySort(t *testing.T) { type test struct { name string replicas []*CandidateReplica - ordering []int + ordering []int32 } - model := store.NewModelVersion( + model := util.NewTestModelVersion( nil, 1, "server1", - map[int]store.ReplicaStatus{3: {State: store.Loading}}, - false, - store.ModelProgressing) + map[int32]*db.ReplicaStatus{3: {State: db.ModelReplicaState_Loading}}, + db.ModelState_ModelProgressing) tests := []test{ { name: "ThreeReplicasDifferentMemory", replicas: []*CandidateReplica{ - {Model: model, Replica: store.NewServerReplica("", 8080, 5001, 1, store.NewServer("dummy", true), []string{}, 100, 100, 0, map[store.ModelVersionID]bool{}, 100)}, - {Model: model, Replica: store.NewServerReplica("", 8080, 5001, 2, store.NewServer("dummy", true), []string{}, 100, 200, 0, map[store.ModelVersionID]bool{}, 100)}, - {Model: model, Replica: store.NewServerReplica("", 8080, 5001, 3, store.NewServer("dummy", true), []string{}, 100, 150, 0, map[store.ModelVersionID]bool{}, 100)}, + {Model: model, Replica: util.NewTestServerReplica("", 8080, 5001, 1, store.NewServer("dummy", true), []string{}, 100, 100, 0, []*db.ModelVersionID{}, 100)}, + {Model: model, Replica: util.NewTestServerReplica("", 8080, 5001, 2, store.NewServer("dummy", true), []string{}, 100, 200, 0, []*db.ModelVersionID{}, 100)}, + {Model: model, Replica: util.NewTestServerReplica("", 8080, 5001, 3, store.NewServer("dummy", true), []string{}, 100, 150, 0, []*db.ModelVersionID{}, 100)}, }, - ordering: []int{2, 3, 1}, + ordering: []int32{2, 3, 1}, }, { name: "ThreeReplicasDifferentMemoryWithReserved", replicas: []*CandidateReplica{ - {Model: model, Replica: store.NewServerReplica("", 8080, 5001, 1, store.NewServer("dummy", true), []string{}, 100, 100, 0, map[store.ModelVersionID]bool{}, 100)}, - {Model: model, Replica: store.NewServerReplica("", 8080, 5001, 2, store.NewServer("dummy", true), []string{}, 100, 200, 150, map[store.ModelVersionID]bool{}, 100)}, - {Model: model, Replica: store.NewServerReplica("", 8080, 5001, 3, store.NewServer("dummy", true), []string{}, 100, 150, 0, map[store.ModelVersionID]bool{}, 100)}, + {Model: model, Replica: util.NewTestServerReplica("", 8080, 5001, 1, store.NewServer("dummy", true), []string{}, 100, 100, 0, []*db.ModelVersionID{}, 100)}, + {Model: model, Replica: util.NewTestServerReplica("", 8080, 5001, 2, store.NewServer("dummy", true), []string{}, 100, 200, 150, []*db.ModelVersionID{}, 100)}, + {Model: model, Replica: util.NewTestServerReplica("", 8080, 5001, 3, store.NewServer("dummy", true), []string{}, 100, 150, 0, []*db.ModelVersionID{}, 100)}, }, - ordering: []int{3, 1, 2}, + ordering: []int32{3, 1, 2}, }, } From 1b1a95f7e4be135d12b73eea2758da9c8b9bc19a Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Wed, 14 Jan 2026 13:41:03 +0000 Subject: [PATCH 45/50] wip --- scheduler/pkg/scheduler/scheduler_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scheduler/pkg/scheduler/scheduler_test.go b/scheduler/pkg/scheduler/scheduler_test.go index 03cf3289c4..0c3eb21d65 100644 --- a/scheduler/pkg/scheduler/scheduler_test.go +++ b/scheduler/pkg/scheduler/scheduler_test.go @@ -739,6 +739,8 @@ func TestScheduler(t *testing.T) { ) m.EXPECT().UpdateLoadedModels("model1", uint32(1), "server1", expectedUpdatedServers).Return(nil) + + m.EXPECT().FailedScheduling("model1", 1, "Failed to schedule model as no matching servers are available false", false).Return(nil) }, }, { From e80e6fa94b2bbede5c56b6f4433f9f4cd341b1a2 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Wed, 14 Jan 2026 14:13:09 +0000 Subject: [PATCH 46/50] wip --- scheduler/pkg/scheduler/scheduler_test.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/scheduler/pkg/scheduler/scheduler_test.go b/scheduler/pkg/scheduler/scheduler_test.go index 0c3eb21d65..f2017749c5 100644 --- a/scheduler/pkg/scheduler/scheduler_test.go +++ b/scheduler/pkg/scheduler/scheduler_test.go @@ -673,7 +673,7 @@ func TestScheduler(t *testing.T) { 1: gsr(1, 200, []string{"sklearn"}, "server2", true, false), }, Shared: true, - ExpectedReplicas: -1, + ExpectedReplicas: 1, }, }, scheduled: true, @@ -724,7 +724,7 @@ func TestScheduler(t *testing.T) { 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), }, Shared: true, - ExpectedReplicas: -1, + ExpectedReplicas: 1, KubernetesMeta: nil, }, } @@ -739,8 +739,6 @@ func TestScheduler(t *testing.T) { ) m.EXPECT().UpdateLoadedModels("model1", uint32(1), "server1", expectedUpdatedServers).Return(nil) - - m.EXPECT().FailedScheduling("model1", 1, "Failed to schedule model as no matching servers are available false", false).Return(nil) }, }, { From cf19ed2c32c123a7a08f489ababce87696f2ab6c Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Wed, 14 Jan 2026 14:15:55 +0000 Subject: [PATCH 47/50] fix Reschedule test --- scheduler/pkg/scheduler/scheduler_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scheduler/pkg/scheduler/scheduler_test.go b/scheduler/pkg/scheduler/scheduler_test.go index f2017749c5..b0081791bb 100644 --- a/scheduler/pkg/scheduler/scheduler_test.go +++ b/scheduler/pkg/scheduler/scheduler_test.go @@ -689,7 +689,6 @@ func TestScheduler(t *testing.T) { StorageConfig: nil, Requirements: []string{"sklearn"}, MemoryBytes: ptr2.To(uint64(200)), - Server: ptr.String("server1"), Parameters: nil, ModelRuntimeInfo: nil, ModelSpec: nil, @@ -738,7 +737,7 @@ func TestScheduler(t *testing.T) { nil, ) m.EXPECT().UpdateLoadedModels("model1", uint32(1), - "server1", expectedUpdatedServers).Return(nil) + "server2", expectedUpdatedServers).Return(nil) }, }, { From 94f7670076e75176a8073582d76e3e666aca7e20 Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Wed, 14 Jan 2026 14:20:28 +0000 Subject: [PATCH 48/50] revert --- scheduler/pkg/scheduler/scheduler_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scheduler/pkg/scheduler/scheduler_test.go b/scheduler/pkg/scheduler/scheduler_test.go index b0081791bb..1b20e0559b 100644 --- a/scheduler/pkg/scheduler/scheduler_test.go +++ b/scheduler/pkg/scheduler/scheduler_test.go @@ -723,7 +723,7 @@ func TestScheduler(t *testing.T) { 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), }, Shared: true, - ExpectedReplicas: 1, + ExpectedReplicas: -1, KubernetesMeta: nil, }, } From cc9334a1d67ff89c092c4e5d309807ea602b443f Mon Sep 17 00:00:00 2001 From: Miguel Angel Date: Wed, 14 Jan 2026 16:01:26 +0100 Subject: [PATCH 49/50] scheduler part two fixes * TestScheduleFailedModels * TestRemoveAllVersions * clean up --- scheduler/pkg/scheduler/scheduler_test.go | 760 ++++++++-------------- 1 file changed, 254 insertions(+), 506 deletions(-) diff --git a/scheduler/pkg/scheduler/scheduler_test.go b/scheduler/pkg/scheduler/scheduler_test.go index 1b20e0559b..4db4941e48 100644 --- a/scheduler/pkg/scheduler/scheduler_test.go +++ b/scheduler/pkg/scheduler/scheduler_test.go @@ -28,7 +28,6 @@ import ( "go.uber.org/mock/gomock" ptr2 "k8s.io/utils/ptr" - pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" pbs "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" @@ -42,35 +41,9 @@ func TestScheduler(t *testing.T) { logger := log.New() g := NewGomegaWithT(t) - newTestModel := func(name string, requiredMemory uint64, requirements []string, replicas, minReplicas uint32, maxReplicas uint32, loadedModels []int, deleted bool, scheduledServer string, drainedModels []int) *store.ModelSnapshot { - config := &pb.Model{Meta: &pb.MetaData{Name: t.Name()}, ModelSpec: &pb.ModelSpec{MemoryBytes: &requiredMemory, Requirements: requirements}, DeploymentSpec: &pb.DeploymentSpec{Replicas: replicas, MinReplicas: minReplicas, MaxReplicas: maxReplicas}} - rmap := make(map[int]store.ReplicaStatus) - for _, ridx := range loadedModels { - rmap[ridx] = store.ReplicaStatus{State: store.Loaded} - } - for _, ridx := range drainedModels { - rmap[ridx] = store.ReplicaStatus{State: store.Draining} - } - return &store.ModelSnapshot{ - Name: name, - Versions: []*store.ModelVersion{store.NewModelVersion(config, 1, scheduledServer, rmap, false, store.ModelProgressing)}, - Deleted: deleted, - } - } - - gsr := func(replicaIdx int, availableMemory uint64, capabilities []string, serverName string, shared, isDraining bool) *store.ServerReplica { - replica := store.NewServerReplica("svc", 8080, 5001, replicaIdx, store.NewServer(serverName, shared), capabilities, availableMemory, availableMemory, 0, nil, 100) - if isDraining { - replica.SetIsDraining() - } - return replica - } - type test struct { name string - model *store.ModelSnapshot modelName string - servers []*store.ServerSnapshot scheduled bool checkServerEvents bool expectedServerEvents int @@ -128,25 +101,7 @@ func TestScheduler(t *testing.T) { }, { name: "ReplicasTwo", - model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 2, []int{}, false, "", nil), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*store.ServerReplica{0: gsr(0, 200, []string{"sklearn"}, "server1", true, false)}, - Shared: true, - ExpectedReplicas: -1, - }, - { - Name: "server2", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 200, []string{"sklearn"}, "server2", true, false), // expect schedule here - 1: gsr(1, 200, []string{"sklearn"}, "server2", true, false), // expect schedule here - }, - Shared: true, - ExpectedReplicas: -1, - }, - }, scheduled: true, setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -206,25 +161,7 @@ func TestScheduler(t *testing.T) { }, { name: "NotEnoughReplicas", - model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 2, []int{}, false, "", nil), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*store.ServerReplica{0: gsr(0, 200, []string{"sklearn"}, "server1", true, false)}, - Shared: true, - ExpectedReplicas: -1, - }, - { - Name: "server2", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 200, []string{"sklearn"}, "server2", true, false), - 1: gsr(1, 0, []string{"sklearn"}, "server2", true, false), - }, - Shared: true, - ExpectedReplicas: -1, - }, - }, scheduled: false, setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -279,25 +216,7 @@ func TestScheduler(t *testing.T) { }, { name: "NotEnoughReplicas - schedule min replicas", - model: newTestModel("model1", 100, []string{"sklearn"}, 3, 2, 3, []int{}, false, "server2", nil), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*store.ServerReplica{0: gsr(0, 200, []string{"sklearn"}, "server1", true, false)}, - Shared: true, - ExpectedReplicas: -1, - }, - { - Name: "server2", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 200, []string{"sklearn"}, "server2", true, false), // expect schedule here - 1: gsr(1, 200, []string{"sklearn"}, "server2", true, false), // expect schedule here - }, - Shared: true, - ExpectedReplicas: -1, - }, - }, scheduled: true, // not here that we still trying to mark the model as Available setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -358,24 +277,7 @@ func TestScheduler(t *testing.T) { }, { name: "MemoryOneServer", - model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{}, false, "", nil), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*store.ServerReplica{0: gsr(0, 50, []string{"sklearn"}, "server1", true, false)}, - Shared: true, - ExpectedReplicas: -1, - }, - { - Name: "server2", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 200, []string{"sklearn"}, "server2", true, false), // expect schedule here - }, - Shared: true, - ExpectedReplicas: -1, - }, - }, scheduled: true, setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -433,25 +335,7 @@ func TestScheduler(t *testing.T) { }, { name: "ModelsLoaded", - model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 2, []int{1}, false, "", nil), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*store.ServerReplica{0: gsr(0, 50, []string{"sklearn"}, "server1", true, false)}, - Shared: true, - ExpectedReplicas: -1, - }, - { - Name: "server2", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 200, []string{"sklearn"}, "server2", true, false), // expect schedule here - 1: gsr(1, 200, []string{"sklearn"}, "server2", true, false), // expect schedule here - }, - Shared: true, - ExpectedReplicas: -1, - }, - }, scheduled: true, setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -509,19 +393,7 @@ func TestScheduler(t *testing.T) { }, { name: "ModelUnLoaded", - model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 2, []int{1}, true, "server2", nil), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server2", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 200, []string{"sklearn"}, "server2", true, false), - 1: gsr(1, 200, []string{"sklearn"}, "server2", true, false), - }, - Shared: true, - ExpectedReplicas: -1, - }, - }, scheduled: true, setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -578,25 +450,7 @@ func TestScheduler(t *testing.T) { }, { name: "DeletedServer", - model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{}, false, "", nil), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*store.ServerReplica{0: gsr(0, 200, []string{"sklearn"}, "server1", true, false)}, - Shared: true, - ExpectedReplicas: 0, - }, - { - Name: "server2", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 200, []string{"sklearn"}, "server2", true, false), // expect schedule here - 1: gsr(1, 200, []string{"sklearn"}, "server2", true, false), - }, - Shared: true, - ExpectedReplicas: -1, - }, - }, scheduled: true, setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -657,25 +511,7 @@ func TestScheduler(t *testing.T) { }, { name: "Reschedule", - model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{0}, false, "server1", nil), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*store.ServerReplica{0: gsr(0, 200, []string{"sklearn"}, "server1", true, false)}, - Shared: true, - ExpectedReplicas: 0, - }, - { - Name: "server2", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 200, []string{"sklearn"}, "server2", true, false), // expect schedule here - 1: gsr(1, 200, []string{"sklearn"}, "server2", true, false), - }, - Shared: true, - ExpectedReplicas: 1, - }, - }, scheduled: true, setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -742,16 +578,7 @@ func TestScheduler(t *testing.T) { }, { name: "DeletedServerFail", - model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{1}, false, "", nil), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*store.ServerReplica{0: gsr(0, 200, []string{"sklearn"}, "server1", true, false)}, - Shared: true, - ExpectedReplicas: 0, - }, - }, scheduled: false, setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -803,19 +630,7 @@ func TestScheduler(t *testing.T) { }, { name: "Available memory sorting", - model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{1}, false, "", nil), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server2", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 150, []string{"sklearn"}, "server2", true, false), - 1: gsr(1, 200, []string{"sklearn"}, "server2", true, false), // expect schedule here - }, - Shared: true, - ExpectedReplicas: -1, - }, - }, scheduled: true, setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -867,20 +682,7 @@ func TestScheduler(t *testing.T) { }, { name: "Available memory sorting with multiple replicas", - model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 1, []int{1}, false, "", nil), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server2", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 150, []string{"sklearn"}, "server2", true, false), - 1: gsr(1, 200, []string{"sklearn"}, "server2", true, false), // expect schedule here - 2: gsr(2, 175, []string{"sklearn"}, "server2", true, false), // expect schedule here - }, - Shared: true, - ExpectedReplicas: -1, - }, - }, scheduled: true, setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -934,21 +736,7 @@ func TestScheduler(t *testing.T) { }, { name: "Scale up", - model: newTestModel("model1", 100, []string{"sklearn"}, 3, 0, 3, []int{1, 2}, false, "server1", nil), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 50, []string{"sklearn"}, "server1", true, false), - 1: gsr(1, 200, []string{"sklearn"}, "server1", true, false), // expect schedule here - nop - 2: gsr(2, 175, []string{"sklearn"}, "server1", true, false), // expect schedule here - nop - 3: gsr(3, 100, []string{"sklearn"}, "server1", true, false), // expect schedule here - }, - Shared: true, - ExpectedReplicas: -1, - }, - }, scheduled: true, setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -1015,21 +803,7 @@ func TestScheduler(t *testing.T) { }, { name: "Scale down", - model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{1, 2}, false, "server1", nil), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 50, []string{"sklearn"}, "server1", true, false), - 1: gsr(1, 200, []string{"sklearn"}, "server1", true, false), // expect schedule here - nop - 2: gsr(2, 175, []string{"sklearn"}, "server1", true, false), // expect schedule here - nop - 3: gsr(3, 100, []string{"sklearn"}, "server1", true, false), - }, - Shared: true, - ExpectedReplicas: -1, - }, - }, scheduled: true, setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -1093,22 +867,8 @@ func TestScheduler(t *testing.T) { }, }, { - name: "Scale up - not enough replicas use max of the server", - model: newTestModel("model1", 100, []string{"sklearn"}, 5, 3, 5, []int{1, 2}, false, "server1", nil), - modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 100, []string{"sklearn"}, "server1", true, false), // expect schedule here - 1: gsr(1, 100, []string{"sklearn"}, "server1", true, false), // expect schedule here - nop - 2: gsr(2, 100, []string{"sklearn"}, "server1", true, false), // expect schedule here - nop - 3: gsr(3, 100, []string{"sklearn"}, "server1", true, false), // expect schedule here - }, - Shared: true, - ExpectedReplicas: -1, - }, - }, + name: "Scale up - not enough replicas use max of the server", + modelName: "model1", scheduled: true, // note that we are still trying to make the model as Available expectedServerEvents: 1, setupMock: func(m *mock.MockModelServerAPI) { @@ -1178,21 +938,7 @@ func TestScheduler(t *testing.T) { }, { name: "Scale up - no capacity on loaded replica servers, should still go there", - model: newTestModel("model1", 100, []string{"sklearn"}, 3, 0, 3, []int{1, 2}, false, "server1", nil), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 50, []string{"sklearn"}, "server1", true, false), - 1: gsr(1, 0, []string{"sklearn"}, "server1", true, false), // expect schedule here - nop - 2: gsr(2, 0, []string{"sklearn"}, "server1", true, false), // expect schedule here - nop - 3: gsr(3, 100, []string{"sklearn"}, "server1", true, false), // expect schedule here - }, - Shared: true, - ExpectedReplicas: -1, - }, - }, scheduled: true, setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -1260,21 +1006,8 @@ func TestScheduler(t *testing.T) { }, { name: "Scale down - no capacity on loaded replica servers, should still go there", - model: newTestModel("model1", 100, []string{"sklearn"}, 1, 0, 1, []int{1, 2}, false, "server1", nil), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 50, []string{"sklearn"}, "server1", true, false), - 1: gsr(1, 0, []string{"sklearn"}, "server1", true, false), // expect schedule here - nop - 2: gsr(2, 0, []string{"sklearn"}, "server1", true, false), // expect schedule here - nop - 3: gsr(3, 100, []string{"sklearn"}, "server1", true, false), - }, - Shared: true, - ExpectedReplicas: -1, - }, - }, + scheduled: true, setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -1342,21 +1075,7 @@ func TestScheduler(t *testing.T) { }, { name: "Drain", - model: newTestModel("model1", 100, []string{"sklearn"}, 2, 0, 2, []int{1}, false, "server1", []int{2}), modelName: "model1", - servers: []*store.ServerSnapshot{ - { - Name: "server1", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 50, []string{"sklearn"}, "server1", true, false), - 1: gsr(1, 200, []string{"sklearn"}, "server1", true, false), // expect schedule here - nop - 2: gsr(2, 175, []string{"sklearn"}, "server1", true, true), // drain - should not be returned - 3: gsr(3, 100, []string{"sklearn"}, "server1", true, false), // expect schedule here new replica - }, - Shared: true, - ExpectedReplicas: -1, - }, - }, scheduled: true, setupMock: func(m *mock.MockModelServerAPI) { m.EXPECT().LockModel("model1") @@ -1462,12 +1181,6 @@ func TestFailedModels(t *testing.T) { logger := log.New() g := NewGomegaWithT(t) - type modelStateWithMetadata struct { - state store.ModelState - deploymentSpec *pb.DeploymentSpec - availableReplicas uint32 - } - type test struct { name string models []*db.Model @@ -1658,7 +1371,6 @@ func TestFailedModels(t *testing.T) { mockModelServerAPI := mock.NewMockModelServerAPI(ctrl) test.setupMock(mockModelServerAPI) - //mockStore := newMockStore(test.models) scheduler := NewSimpleScheduler(logger, mockModelServerAPI, DefaultSchedulerConfig(mockModelServerAPI), synchroniser.NewSimpleSynchroniser(10*time.Millisecond), eventHub) failedModels, err := scheduler.getFailedModels() g.Expect(err).To(BeNil()) @@ -1669,85 +1381,135 @@ func TestFailedModels(t *testing.T) { } } +// todo: assert on how many version were removed func TestRemoveAllVersions(t *testing.T) { logger := log.New() g := NewGomegaWithT(t) - newTestModel := func(name string, requirements []string, loadedModels []int, scheduledServer string, numVersions int) *store.ModelSnapshot { - config := &pb.Model{Meta: &pb.MetaData{Name: t.Name()}, ModelSpec: &pb.ModelSpec{Requirements: requirements}} - rmap := make(map[int]store.ReplicaStatus) - for _, ridx := range loadedModels { - rmap[ridx] = store.ReplicaStatus{State: store.Loaded} - } - - versions := []*store.ModelVersion{} - for i := 1; i <= numVersions; i++ { - versions = append(versions, store.NewModelVersion(config, uint32(i), scheduledServer, rmap, false, store.ModelAvailable)) - } - // load a bad version - this should not get unloaded by the test - versions = append(versions, store.NewModelVersion(config, uint32(numVersions+1), scheduledServer, map[int]store.ReplicaStatus{}, false, store.ScheduleFailed)) - - return &store.ModelSnapshot{ - Name: name, - Versions: versions, - Deleted: true, - } - } - - gsr := func(replicaIdx int, availableMemory uint64, capabilities []string, serverName string) *store.ServerReplica { - replica := store.NewServerReplica("svc", 8080, 5001, replicaIdx, store.NewServer(serverName, true), capabilities, availableMemory, availableMemory, 0, nil, 100) - return replica - } - - //newMockStore := func(model *store.ModelSnapshot, servers []*store.ServerSnapshot) *mockStore { - // modelMap := make(map[string]*store.ModelSnapshot) - // modelMap[model.Name] = model - // return &mockStore{ - // models: modelMap, - // servers: servers, - // unloadedModels: make(map[string]uint32), - // } - //} - type test struct { - name string - model *store.ModelSnapshot - servers []*store.ServerSnapshot - numVersions int + name string + modelName string + setupMock func(m *mock.MockModelServerAPI) } tests := []test{ { - name: "Allversions - 1", - model: newTestModel("model1", []string{"sklearn"}, []int{0, 1}, "server", 1), - servers: []*store.ServerSnapshot{ - { - Name: "server2", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 200, []string{"sklearn"}, "server"), - 1: gsr(1, 200, []string{"sklearn"}, "server"), + name: "Allversions - 1", + modelName: "model1", + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: []string{"sklearn"}, + MemoryBytes: nil, + Server: nil, + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{}, + }, + 1, "server2", + map[int32]*db.ReplicaStatus{ + + 0: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + 1: { + State: db.ModelReplicaState_Loaded, + Reason: "", + Timestamp: nil, + }, + }, + db.ModelState_ModelAvailable), + }}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, }, - Shared: true, - ExpectedReplicas: -1, - }, + } + + expectedUpdatedServers := make([]*db.ServerReplica, 0) + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server2", expectedUpdatedServers).Return(nil) }, - numVersions: 1, }, { - name: "Allversions - > 1", - model: newTestModel("model1", []string{"sklearn"}, []int{0, 1}, "server", 10), - servers: []*store.ServerSnapshot{ - { - Name: "server", - Replicas: map[int]*store.ServerReplica{ - 0: gsr(0, 200, []string{"sklearn"}, "server"), - 1: gsr(1, 200, []string{"sklearn"}, "server"), + name: "Allversions - > 1", + modelName: "model1", + setupMock: func(m *mock.MockModelServerAPI) { + m.EXPECT().LockModel("model1") + + versions := make([]*db.ModelVersion, 0, 10) + + for i := 0; i < 10; i++ { + versions = append(versions, + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + Requirements: []string{"sklearn"}, + }, + DeploymentSpec: &pbs.DeploymentSpec{}, + }, + 1, "server2", + map[int32]*db.ReplicaStatus{ + 0: { + State: db.ModelReplicaState_Loaded, + }, + 1: { + State: db.ModelReplicaState_Loaded, + }, + }, + db.ModelState_ModelAvailable, + ), + ) + } + + m.EXPECT().GetModel("model1").Return(&db.Model{Name: "model1", Versions: versions}, nil).MinTimes(1) + m.EXPECT().UnlockModel("model1") + servers := []*db.Server{ + { + Name: "server2", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + 1: util.NewTestServerReplica("host1", 8080, 5000, 1, store.NewServer("server2", true), []string{"sklearn"}, 0, 200, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, }, - Shared: true, - ExpectedReplicas: -1, - }, + } + + expectedUpdatedServers := make([]*db.ServerReplica, 0) + m.EXPECT().GetServers().Return( + servers, + nil, + ) + m.EXPECT().UpdateLoadedModels("model1", uint32(1), + "server2", expectedUpdatedServers).Return(nil) }, - numVersions: 10, }, } @@ -1757,14 +1519,12 @@ func TestRemoveAllVersions(t *testing.T) { ctrl := gomock.NewController(t) mockModelServerAPI := mock.NewMockModelServerAPI(ctrl) - //test.setupMock(mockModelServerAPI) + test.setupMock(mockModelServerAPI) - //mockStore := newMockStore(test.model, test.servers) scheduler := NewSimpleScheduler(logger, mockModelServerAPI, DefaultSchedulerConfig(mockModelServerAPI), synchroniser.NewSimpleSynchroniser(time.Duration(10*time.Millisecond)), nil) - err := scheduler.Schedule(test.model.Name) + err := scheduler.Schedule(test.modelName) g.Expect(err).To(BeNil()) - //g.Expect(mockStore.unloadedModels[test.model.Name]).To(Equal(uint32(test.numVersions))) }) } } @@ -1784,49 +1544,55 @@ func TestScheduleFailedModels(t *testing.T) { setupMocks: func(ms *mock.MockModelServerAPI, sync *mock2.MockSynchroniser) { sync.EXPECT().IsReady().Return(true) - model1 := &store.ModelSnapshot{ + model1 := &db.Model{ Name: "model1", - Versions: []*store.ModelVersion{store.NewModelVersion(&pb.Model{ - Meta: &pb.MetaData{ - Name: "model1", - Kind: nil, - Version: nil, - KubernetesMeta: nil, - }, - ModelSpec: &pb.ModelSpec{ - Uri: "", - ArtifactVersion: nil, - StorageConfig: nil, - Requirements: nil, - MemoryBytes: nil, - Server: ptr.String("server1"), - Parameters: nil, - ModelRuntimeInfo: nil, - ModelSpec: nil, - }, - DeploymentSpec: &pb.DeploymentSpec{ - Replicas: 1, - MinReplicas: 0, - MaxReplicas: 0, - LogPayloads: false, - }, - StreamSpec: nil, - DataflowSpec: nil, - }, 1, "server1", map[int]store.ReplicaStatus{}, false, store.ScheduleFailed)}, + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: nil, + MemoryBytes: nil, + Server: ptr.String("server1"), + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 0}, + }, + 1, "server1", + nil, + db.ModelState_ScheduleFailed)}, } - ms.EXPECT().GetModels().Return([]*store.ModelSnapshot{model1}, nil) + ms.EXPECT().GetModels().Return([]*db.Model{model1}, nil) ms.EXPECT().LockModel("model1") ms.EXPECT().UnlockModel("model1") ms.EXPECT().GetModel("model1").Return(model1, nil) - servers := []*store.ServerSnapshot{ - createServerSnapshot("server1", 1, 16000), + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server1", true), []string{"sklearn"}, 0, 16000, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, } + + expectedServerUpdate := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server1", true), []string{"sklearn"}, 0, 16000, 0, nil, 100), + } + ms.EXPECT().GetServers().Return(servers, nil) ms.EXPECT().UpdateLoadedModels("model1", uint32(1), - "server1", slices.Collect(maps.Values(servers[0].Replicas))).Return(nil) + "server1", expectedServerUpdate).Return(nil) }, expectedModels: []string{"model1"}, expectError: false, @@ -1836,81 +1602,79 @@ func TestScheduleFailedModels(t *testing.T) { setupMocks: func(ms *mock.MockModelServerAPI, sync *mock2.MockSynchroniser) { sync.EXPECT().IsReady().Return(true) - model1 := &store.ModelSnapshot{ + model1 := &db.Model{ Name: "model1", - Versions: []*store.ModelVersion{store.NewModelVersion(&pb.Model{ - Meta: &pb.MetaData{ - Name: "model1", - Kind: nil, - Version: nil, - KubernetesMeta: nil, - }, - ModelSpec: &pb.ModelSpec{ - Uri: "", - ArtifactVersion: nil, - StorageConfig: nil, - Requirements: nil, - MemoryBytes: nil, - Server: ptr.String("server1"), - Parameters: nil, - ModelRuntimeInfo: nil, - ModelSpec: nil, - }, - DeploymentSpec: &pb.DeploymentSpec{ - Replicas: 1, - MinReplicas: 0, - MaxReplicas: 0, - LogPayloads: false, - }, - StreamSpec: nil, - DataflowSpec: nil, - }, 1, "server1", map[int]store.ReplicaStatus{}, false, store.ScheduleFailed)}, + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: nil, + MemoryBytes: nil, + Server: ptr.String("server1"), + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 0}, + }, + 1, "server1", + nil, + db.ModelState_ScheduleFailed)}, } - model2 := &store.ModelSnapshot{ + model2 := &db.Model{ Name: "model2", - Versions: []*store.ModelVersion{store.NewModelVersion(&pb.Model{ - Meta: &pb.MetaData{ - Name: "model2", - Kind: nil, - Version: nil, - KubernetesMeta: nil, - }, - ModelSpec: &pb.ModelSpec{ - Uri: "", - ArtifactVersion: nil, - StorageConfig: nil, - Requirements: nil, - MemoryBytes: nil, - Server: ptr.String("server1"), - Parameters: nil, - ModelRuntimeInfo: nil, - ModelSpec: nil, - }, - DeploymentSpec: &pb.DeploymentSpec{ - Replicas: 1, - MinReplicas: 0, - MaxReplicas: 0, - LogPayloads: false, - }, - StreamSpec: nil, - DataflowSpec: nil, - }, 1, "server1", map[int]store.ReplicaStatus{}, false, store.ScheduleFailed)}, + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model2"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: nil, + MemoryBytes: nil, + Server: ptr.String("server1"), + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 1, MinReplicas: 0, MaxReplicas: 0}, + }, + 1, "server1", + nil, + db.ModelState_ScheduleFailed)}, } - ms.EXPECT().GetModels().Return([]*store.ModelSnapshot{model1, model2}, nil) + ms.EXPECT().GetModels().Return([]*db.Model{model1, model2}, nil) // model1 ms.EXPECT().LockModel("model1") ms.EXPECT().UnlockModel("model1") ms.EXPECT().GetModel("model1").Return(model1, nil) - servers := []*store.ServerSnapshot{ - createServerSnapshot("server1", 1, 16000), + servers := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server1", true), []string{"sklearn"}, 0, 16000, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, + } + + expectedUpdatedServers := []*db.ServerReplica{ + util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server1", true), []string{"sklearn"}, 0, 16000, 0, nil, 100), } ms.EXPECT().GetServers().Return(servers, nil) ms.EXPECT().UpdateLoadedModels("model1", uint32(1), - "server1", slices.Collect(maps.Values(servers[0].Replicas))).Return(nil) + "server1", expectedUpdatedServers).Return(nil) // model2 @@ -1920,7 +1684,7 @@ func TestScheduleFailedModels(t *testing.T) { ms.EXPECT().GetServers().Return(servers, nil) ms.EXPECT().UpdateLoadedModels("model2", uint32(1), - "server1", slices.Collect(maps.Values(servers[0].Replicas))).Return(nil) + "server1", expectedUpdatedServers).Return(nil) }, expectedModels: []string{"model1", "model2"}, expectError: false, @@ -1930,47 +1694,48 @@ func TestScheduleFailedModels(t *testing.T) { setupMocks: func(ms *mock.MockModelServerAPI, sync *mock2.MockSynchroniser) { sync.EXPECT().IsReady().Return(true) - model1 := &store.ModelSnapshot{ + model11 := &db.Model{ Name: "model1", - Versions: []*store.ModelVersion{store.NewModelVersion(&pb.Model{ - Meta: &pb.MetaData{ - Name: "model1", - Kind: nil, - Version: nil, - KubernetesMeta: nil, - }, - ModelSpec: &pb.ModelSpec{ - Uri: "", - ArtifactVersion: nil, - StorageConfig: nil, - Requirements: nil, - MemoryBytes: nil, - Server: ptr.String("server1"), - Parameters: nil, - ModelRuntimeInfo: nil, - ModelSpec: nil, - }, - DeploymentSpec: &pb.DeploymentSpec{ - Replicas: 3, - MinReplicas: 2, - MaxReplicas: 0, - LogPayloads: false, - }, - StreamSpec: nil, - DataflowSpec: nil, - }, 1, "server1", map[int]store.ReplicaStatus{}, false, store.ScheduleFailed)}, + Versions: []*db.ModelVersion{ + util.NewTestModelVersion( + &pbs.Model{ + Meta: &pbs.MetaData{Name: "model1"}, + ModelSpec: &pbs.ModelSpec{ + Uri: "", + ArtifactVersion: nil, + StorageConfig: nil, + Requirements: nil, + MemoryBytes: nil, + Server: ptr.String("server1"), + Parameters: nil, + ModelRuntimeInfo: nil, + ModelSpec: nil, + }, + DeploymentSpec: &pbs.DeploymentSpec{Replicas: 3, MinReplicas: 2, MaxReplicas: 0}, + }, + 1, "server1", + nil, + db.ModelState_ScheduleFailed)}, } - ms.EXPECT().GetModels().Return([]*store.ModelSnapshot{model1}, nil) + ms.EXPECT().GetModels().Return([]*db.Model{model11}, nil) ms.EXPECT().LockModel("model1") ms.EXPECT().UnlockModel("model1") - ms.EXPECT().GetModel("model1").Return(model1, nil) + ms.EXPECT().GetModel("model1").Return(model11, nil) - servers := []*store.ServerSnapshot{ - createServerSnapshot("server1", 1, 16000), + serverss := []*db.Server{ + { + Name: "server1", + Replicas: map[int32]*db.ServerReplica{ + 0: util.NewTestServerReplica("host1", 8080, 5000, 0, store.NewServer("server1", true), []string{"sklearn"}, 0, 16000, 0, nil, 100), + }, + Shared: true, + ExpectedReplicas: -1, + KubernetesMeta: nil, + }, } - ms.EXPECT().GetServers().Return(servers, nil) + ms.EXPECT().GetServers().Return(serverss, nil) ms.EXPECT().FailedScheduling("model1", uint32(1), "Failed to schedule model as no matching server had enough suitable replicas", true).Return(nil) }, @@ -1992,18 +1757,18 @@ func TestScheduleFailedModels(t *testing.T) { t.Parallel() ctrl := gomock.NewController(t) - mockStore := mock.NewMockModelServerAPI(ctrl) + mockModelServerAPI := mock.NewMockModelServerAPI(ctrl) mockSync := mock2.NewMockSynchroniser(ctrl) - tt.setupMocks(mockStore, mockSync) + tt.setupMocks(mockModelServerAPI, mockSync) eventHub, err := coordinator.NewEventHub(log.New()) require.NoError(t, err) scheduler := NewSimpleScheduler( log.New(), - mockStore, - DefaultSchedulerConfig(mockStore), + mockModelServerAPI, + DefaultSchedulerConfig(mockModelServerAPI), mockSync, eventHub) @@ -2022,20 +1787,3 @@ func TestScheduleFailedModels(t *testing.T) { }) } } - -func createServerSnapshot(name string, numReplicas int, availableMemory uint64) *store.ServerSnapshot { - replicas := make(map[int]*store.ServerReplica, numReplicas) - server := store.NewServer(name, false) - - for i := 0; i < numReplicas; i++ { - replicas[i] = store.NewServerReplica(name+"-svc", - 4000, 5000, i, server, nil, - availableMemory, availableMemory, availableMemory, nil, 0) - } - - return &store.ServerSnapshot{ - Name: name, - Replicas: replicas, - ExpectedReplicas: numReplicas, - } -} From 6ebeac0fd94b2d9488233071b59ad9e51872536d Mon Sep 17 00:00:00 2001 From: Dominic Riordan Date: Wed, 14 Jan 2026 17:13:25 +0000 Subject: [PATCH 50/50] remove dead code and add tests for new storage and proto helpers --- apis/go/go.mod | 11 +- apis/go/go.sum | 38 +- apis/go/mlops/scheduler/db/model_ext_test.go | 1306 +++++++++++++++++ apis/go/mlops/scheduler/db/server_ext_test.go | 578 ++++++++ hodometer/go.mod | 15 +- hodometer/go.sum | 142 +- operator/go.mod | 18 +- operator/go.sum | 54 +- scheduler/cmd/scheduler/main.go | 2 +- scheduler/go.mod | 18 +- scheduler/go.sum | 46 +- scheduler/pkg/agent/server.go | 2 +- scheduler/pkg/agent/server_test.go | 22 +- scheduler/pkg/envoy/processor/incremental.go | 5 +- .../processor/incremental_benchmark_test.go | 2 +- .../pkg/envoy/processor/incremental_test.go | 2 +- scheduler/pkg/envoy/processor/server_test.go | 3 +- scheduler/pkg/envoy/xdscache/seldoncache.go | 2 +- .../pkg/envoy/xdscache/seldoncache_test.go | 2 +- .../conflict_resolution.go | 2 +- scheduler/pkg/kafka/dataflow/server_test.go | 2 +- scheduler/pkg/scheduler/cleaner/versions.go | 3 +- .../scheduler/filters/deletedserver_test.go | 5 +- .../scheduler/filters/replicadraining_test.go | 1 + .../scheduler/filters/replicamemory_test.go | 5 +- .../scheduler/filters/serverreplicas_test.go | 5 +- .../filters/serverrequirements_test.go | 5 +- .../pkg/scheduler/filters/sharing_test.go | 5 +- scheduler/pkg/scheduler/mock/interface.go | 9 + scheduler/pkg/scheduler/scheduler.go | 3 +- scheduler/pkg/scheduler/scheduler_test.go | 5 +- .../pkg/scheduler/sorters/loaded_test.go | 2 + .../scheduler/sorters/replicaindex_test.go | 2 + .../scheduler/sorters/replicamemory_test.go | 2 + scheduler/pkg/server/control_plane_test.go | 2 +- scheduler/pkg/server/server.go | 9 +- scheduler/pkg/server/server_status.go | 2 +- scheduler/pkg/server/server_status_test.go | 4 +- scheduler/pkg/server/server_test.go | 2 +- scheduler/pkg/server/utils.go | 3 +- scheduler/pkg/server/utils_test.go | 1 + scheduler/pkg/store/api.go | 118 +- scheduler/pkg/store/experiment/db.go | 3 +- scheduler/pkg/store/experiment/state.go | 1 + scheduler/pkg/store/experiment/state_test.go | 4 +- scheduler/pkg/store/experiment/store.go | 3 +- scheduler/pkg/store/experiment/store_test.go | 3 +- scheduler/pkg/store/experiment/utils_test.go | 1 + scheduler/pkg/store/in_memory_store.go | 3 +- scheduler/pkg/store/in_memory_store_test.go | 546 +++++++ scheduler/pkg/store/memory.go | 15 +- scheduler/pkg/store/memory_status.go | 4 +- scheduler/pkg/store/memory_test.go | 2 +- scheduler/pkg/store/mesh.go | 789 +--------- scheduler/pkg/store/mesh_test.go | 396 ----- scheduler/pkg/store/mock/store.go | 9 + .../pkg/store/modelreplicastate_string.go | 44 - scheduler/pkg/store/modelstate_string.go | 42 - scheduler/pkg/store/pipeline/db_test.go | 6 +- scheduler/pkg/store/pipeline/status.go | 3 +- scheduler/pkg/store/pipeline/store.go | 2 +- scheduler/pkg/store/pipeline/store_test.go | 4 +- scheduler/pkg/store/pipeline/utils.go | 2 +- scheduler/pkg/store/test_memory_hack.go | 3 +- scheduler/pkg/util/model_versions.go | 9 + scheduler/pkg/util/server_replica.go | 9 + 66 files changed, 2722 insertions(+), 1646 deletions(-) create mode 100644 apis/go/mlops/scheduler/db/model_ext_test.go create mode 100644 apis/go/mlops/scheduler/db/server_ext_test.go create mode 100644 scheduler/pkg/store/in_memory_store_test.go delete mode 100644 scheduler/pkg/store/modelreplicastate_string.go delete mode 100644 scheduler/pkg/store/modelstate_string.go diff --git a/apis/go/go.mod b/apis/go/go.mod index 8e214e16c2..064b9f70de 100644 --- a/apis/go/go.mod +++ b/apis/go/go.mod @@ -5,14 +5,17 @@ go 1.23.0 toolchain go1.24.4 require ( + github.com/onsi/gomega v1.39.0 google.golang.org/grpc v1.73.0 google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.5.1 - google.golang.org/protobuf v1.36.6 + google.golang.org/protobuf v1.36.7 ) require ( - golang.org/x/net v0.41.0 // indirect - golang.org/x/sys v0.33.0 // indirect - golang.org/x/text v0.26.0 // indirect + github.com/google/go-cmp v0.7.0 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/net v0.43.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/text v0.28.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250603155806-513f23925822 // indirect ) diff --git a/apis/go/go.sum b/apis/go/go.sum index 32952115b3..14f8d037e3 100644 --- a/apis/go/go.sum +++ b/apis/go/go.sum @@ -1,13 +1,23 @@ -github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= -github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/onsi/ginkgo/v2 v2.25.3 h1:Ty8+Yi/ayDAGtk4XxmmfUy4GabvM+MegeB4cDLRi6nw= +github.com/onsi/ginkgo/v2 v2.25.3/go.mod h1:43uiyQC4Ed2tkOzLsEYm7hnrb7UJTWHYNsuy3bG/snE= +github.com/onsi/gomega v1.39.0 h1:y2ROC3hKFmQZJNFeGAMeHZKkjBL65mIZcvrLQBF9k6Q= +github.com/onsi/gomega v1.39.0/go.mod h1:ZCU1pkQcXDO5Sl9/VVEGlDyp+zm0m1cmeG5TOzLgdh4= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= @@ -20,17 +30,25 @@ go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5J go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w= go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= -golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= -golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= -golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= +golang.org/x/tools v0.36.0 h1:kWS0uv/zsvHEle1LbV5LE8QujrxB3wfQyxHfhOk0Qkg= +golang.org/x/tools v0.36.0/go.mod h1:WBDiHKJK8YgLHlcQPYQzNCkUxUypCaa5ZegCVutKm+s= google.golang.org/genproto/googleapis/rpc v0.0.0-20250603155806-513f23925822 h1:fc6jSaCT0vBduLYZHYrBBNY4dsWuvgyff9noRNDdBeE= google.golang.org/genproto/googleapis/rpc v0.0.0-20250603155806-513f23925822/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= google.golang.org/grpc v1.73.0 h1:VIWSmpI2MegBtTuFt5/JWy2oXxtjJ/e89Z70ImfD2ok= google.golang.org/grpc v1.73.0/go.mod h1:50sbHOUqWoCQGI8V2HQLJM0B+LMlIUjNSZmow7EVBQc= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.5.1 h1:F29+wU6Ee6qgu9TddPgooOdaqsxTMunOoj8KA5yuS5A= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.5.1/go.mod h1:5KF+wpkbTSbGcR9zteSqZV6fqFOWBl4Yde8En8MryZA= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A= +google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/apis/go/mlops/scheduler/db/model_ext_test.go b/apis/go/mlops/scheduler/db/model_ext_test.go new file mode 100644 index 0000000000..53bc7780ac --- /dev/null +++ b/apis/go/mlops/scheduler/db/model_ext_test.go @@ -0,0 +1,1306 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + +package db + +import ( + "testing" + + . "github.com/onsi/gomega" + pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" +) + +func TestModelVersion_GetAssignment(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + mv *ModelVersion + expected []int + }{ + { + name: "loaded and available replicas", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + 1: {State: ModelReplicaState_Available}, + 2: {State: ModelReplicaState_Unloaded}, + }, + }, + expected: []int{0, 1}, + }, + { + name: "prefer non-draining over draining", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + 1: {State: ModelReplicaState_Draining}, + }, + }, + expected: []int{0}, + }, + { + name: "only draining replicas", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Draining}, + 1: {State: ModelReplicaState_Draining}, + }, + }, + expected: []int{0, 1}, + }, + { + name: "loaded unavailable replicas", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_LoadedUnavailable}, + 1: {State: ModelReplicaState_Unloaded}, + }, + }, + expected: []int{0}, + }, + { + name: "no available replicas", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Unloaded}, + 1: {State: ModelReplicaState_LoadFailed}, + }, + }, + expected: nil, + }, + { + name: "empty replicas", + mv: &ModelVersion{Replicas: map[int32]*ReplicaStatus{}}, + expected: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.mv.GetAssignment() + g.Expect(result).To(ConsistOf(tt.expected)) + }) + } +} + +func TestModelVersion_ReplicaState(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + mv *ModelVersion + expected map[int]*ReplicaStatus + }{ + { + name: "multiple replicas", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + 1: {State: ModelReplicaState_Available}, + }, + }, + expected: map[int]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + 1: {State: ModelReplicaState_Available}, + }, + }, + { + name: "empty replicas", + mv: &ModelVersion{Replicas: map[int32]*ReplicaStatus{}}, + expected: map[int]*ReplicaStatus{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.mv.ReplicaState() + g.Expect(result).To(HaveLen(len(tt.expected))) + for idx, status := range tt.expected { + g.Expect(result[idx].State).To(Equal(status.State)) + } + }) + } +} + +func TestModelVersion_IsLoadingOrLoaded(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + mv *ModelVersion + server string + replicaIdx int + expected bool + }{ + { + name: "loading on correct server", + mv: &ModelVersion{ + Server: "server1", + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loading}, + }, + }, + server: "server1", + replicaIdx: 0, + expected: true, + }, + { + name: "loaded on correct server", + mv: &ModelVersion{ + Server: "server1", + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + }, + }, + server: "server1", + replicaIdx: 0, + expected: true, + }, + { + name: "wrong server", + mv: &ModelVersion{ + Server: "server1", + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + }, + }, + server: "server2", + replicaIdx: 0, + expected: false, + }, + { + name: "wrong replica index", + mv: &ModelVersion{ + Server: "server1", + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + }, + }, + server: "server1", + replicaIdx: 1, + expected: false, + }, + { + name: "unloaded state", + mv: &ModelVersion{ + Server: "server1", + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Unloaded}, + }, + }, + server: "server1", + replicaIdx: 0, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.mv.IsLoadingOrLoaded(tt.server, tt.replicaIdx) + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModelVersion_DesiredReplicas(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + mv *ModelVersion + expected int + }{ + { + name: "3 replicas", + mv: &ModelVersion{ + ModelDefn: &pb.Model{ + DeploymentSpec: &pb.DeploymentSpec{ + Replicas: 3, + }, + }, + }, + expected: 3, + }, + { + name: "0 replicas", + mv: &ModelVersion{ + ModelDefn: &pb.Model{ + DeploymentSpec: &pb.DeploymentSpec{ + Replicas: 0, + }, + }, + }, + expected: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.mv.DesiredReplicas() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModelVersion_ModelName(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + mv *ModelVersion + expected string + }{ + { + name: "valid model name", + mv: &ModelVersion{ + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "test-model", + }, + }, + }, + expected: "test-model", + }, + { + name: "empty model name", + mv: &ModelVersion{ + ModelDefn: &pb.Model{ + Meta: &pb.MetaData{ + Name: "", + }, + }, + }, + expected: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.mv.ModelName() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModelVersion_IsLoadingOrLoadedOnServer(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + mv *ModelVersion + expected bool + }{ + { + name: "has loading replica", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loading}, + }, + }, + expected: true, + }, + { + name: "has loaded replica", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + }, + }, + expected: true, + }, + { + name: "has available replica", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Available}, + }, + }, + expected: true, + }, + { + name: "has loaded unavailable replica", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_LoadedUnavailable}, + }, + }, + expected: true, + }, + { + name: "only unloaded replicas", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Unloaded}, + }, + }, + expected: false, + }, + { + name: "empty replicas", + mv: &ModelVersion{Replicas: map[int32]*ReplicaStatus{}}, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.mv.IsLoadingOrLoadedOnServer() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModelVersion_GetReplicaForState(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + mv *ModelVersion + state ModelReplicaState + expected []int + }{ + { + name: "find loaded replicas", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + 1: {State: ModelReplicaState_Available}, + 2: {State: ModelReplicaState_Loaded}, + }, + }, + state: ModelReplicaState_Loaded, + expected: []int{0, 2}, + }, + { + name: "no replicas in state", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + 1: {State: ModelReplicaState_Available}, + }, + }, + state: ModelReplicaState_Unloaded, + expected: nil, + }, + { + name: "empty replicas", + mv: &ModelVersion{Replicas: map[int32]*ReplicaStatus{}}, + state: ModelReplicaState_Loaded, + expected: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.mv.GetReplicaForState(tt.state) + g.Expect(result).To(ConsistOf(tt.expected)) + }) + } +} + +func TestModelVersion_HasLiveReplicas(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + mv *ModelVersion + expected bool + }{ + { + name: "has loaded replica", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + }, + }, + expected: true, + }, + { + name: "has available replica", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Available}, + }, + }, + expected: true, + }, + { + name: "has draining replica", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Draining}, + }, + }, + expected: true, + }, + { + name: "only unloaded replicas", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Unloaded}, + }, + }, + expected: false, + }, + { + name: "empty replicas", + mv: &ModelVersion{Replicas: map[int32]*ReplicaStatus{}}, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.mv.HasLiveReplicas() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModelVersion_HasServer(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + mv *ModelVersion + expected bool + }{ + { + name: "has server", + mv: &ModelVersion{Server: "server1"}, + expected: true, + }, + { + name: "empty server", + mv: &ModelVersion{Server: ""}, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.mv.HasServer() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModelVersion_GetRequiredMemory(t *testing.T) { + g := NewWithT(t) + + uint64Ptr := func(v uint64) *uint64 { return &v } + + tests := []struct { + name string + mv *ModelVersion + expected uint64 + }{ + { + name: "basic memory without runtime info", + mv: &ModelVersion{ + ModelDefn: &pb.Model{ + ModelSpec: &pb.ModelSpec{ + MemoryBytes: uint64Ptr(1000), + }, + }, + }, + expected: 1000, + }, + { + name: "mlserver with parallel workers", + mv: &ModelVersion{ + ModelDefn: &pb.Model{ + ModelSpec: &pb.ModelSpec{ + MemoryBytes: uint64Ptr(1000), + ModelRuntimeInfo: &pb.ModelRuntimeInfo{ + ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{ + Mlserver: &pb.MLServerModelSettings{ + ParallelWorkers: 3, + }, + }, + }, + }, + }, + }, + expected: 3000, + }, + { + name: "triton with instance count", + mv: &ModelVersion{ + ModelDefn: &pb.Model{ + ModelSpec: &pb.ModelSpec{ + MemoryBytes: uint64Ptr(1000), + ModelRuntimeInfo: &pb.ModelRuntimeInfo{ + ModelRuntimeInfo: &pb.ModelRuntimeInfo_Triton{ + Triton: &pb.TritonModelConfig{ + Cpu: []*pb.TritonCPU{ + {InstanceCount: 2}, + }, + }, + }, + }, + }, + }, + }, + expected: 2000, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.mv.GetRequiredMemory() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModelVersion_SetReplicaState(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + mv *ModelVersion + replicaIdx int + state ModelReplicaState + reason string + expectedLen int + }{ + { + name: "set state on empty replicas", + mv: &ModelVersion{}, + replicaIdx: 0, + state: ModelReplicaState_Loaded, + reason: "test reason", + expectedLen: 1, + }, + { + name: "update existing replica state", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loading}, + }, + }, + replicaIdx: 0, + state: ModelReplicaState_Loaded, + reason: "updated", + expectedLen: 1, + }, + { + name: "add new replica state", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + }, + }, + replicaIdx: 1, + state: ModelReplicaState_Loading, + reason: "new replica", + expectedLen: 2, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.mv.SetReplicaState(tt.replicaIdx, tt.state, tt.reason) + g.Expect(tt.mv.Replicas).To(HaveLen(tt.expectedLen)) + g.Expect(tt.mv.Replicas[int32(tt.replicaIdx)].State).To(Equal(tt.state)) + g.Expect(tt.mv.Replicas[int32(tt.replicaIdx)].Reason).To(Equal(tt.reason)) + g.Expect(tt.mv.Replicas[int32(tt.replicaIdx)].Timestamp).NotTo(BeNil()) + }) + } +} + +func TestModelVersion_GetModelReplicaState(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + mv *ModelVersion + replicaIdx int + expected ModelReplicaState + }{ + { + name: "existing replica", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + }, + }, + replicaIdx: 0, + expected: ModelReplicaState_Loaded, + }, + { + name: "non-existing replica", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + }, + }, + replicaIdx: 1, + expected: ModelReplicaState_ModelReplicaStateUnknown, + }, + { + name: "empty replicas", + mv: &ModelVersion{}, + replicaIdx: 0, + expected: ModelReplicaState_ModelReplicaStateUnknown, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.mv.GetModelReplicaState(tt.replicaIdx) + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModelVersion_Inactive(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + mv *ModelVersion + expected bool + }{ + { + name: "all replicas inactive", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Unloaded}, + 1: {State: ModelReplicaState_UnloadFailed}, + }, + }, + expected: true, + }, + { + name: "has active replica", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + 1: {State: ModelReplicaState_Unloaded}, + }, + }, + expected: false, + }, + { + name: "empty replicas", + mv: &ModelVersion{Replicas: map[int32]*ReplicaStatus{}}, + expected: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.mv.Inactive() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModelVersion_DeleteReplica(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + mv *ModelVersion + replicaIdx int + expectedLen int + }{ + { + name: "delete existing replica", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + 1: {State: ModelReplicaState_Available}, + }, + }, + replicaIdx: 0, + expectedLen: 1, + }, + { + name: "delete non-existing replica", + mv: &ModelVersion{ + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + }, + }, + replicaIdx: 1, + expectedLen: 1, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.mv.DeleteReplica(tt.replicaIdx) + g.Expect(tt.mv.Replicas).To(HaveLen(tt.expectedLen)) + _, exists := tt.mv.Replicas[int32(tt.replicaIdx)] + g.Expect(exists).To(BeFalse()) + }) + } +} + +func TestModel_Latest(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + model *Model + expected *ModelVersion + }{ + { + name: "has versions", + model: &Model{ + Versions: []*ModelVersion{ + {Version: 1}, + {Version: 2}, + {Version: 3}, + }, + }, + expected: &ModelVersion{Version: 3}, + }, + { + name: "no versions", + model: &Model{Versions: []*ModelVersion{}}, + expected: nil, + }, + { + name: "nil versions", + model: &Model{}, + expected: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.model.Latest() + if tt.expected == nil { + g.Expect(result).To(BeNil()) + } else { + g.Expect(result.Version).To(Equal(tt.expected.Version)) + } + }) + } +} + +func TestModel_HasLatest(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + model *Model + expected bool + }{ + { + name: "has versions", + model: &Model{ + Versions: []*ModelVersion{ + {Version: 1}, + }, + }, + expected: true, + }, + { + name: "no versions", + model: &Model{Versions: []*ModelVersion{}}, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.model.HasLatest() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModel_GetVersion(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + model *Model + version uint32 + expected *ModelVersion + }{ + { + name: "version exists", + model: &Model{ + Versions: []*ModelVersion{ + {Version: 1}, + {Version: 2}, + {Version: 3}, + }, + }, + version: 2, + expected: &ModelVersion{Version: 2}, + }, + { + name: "version does not exist", + model: &Model{ + Versions: []*ModelVersion{ + {Version: 1}, + {Version: 2}, + }, + }, + version: 5, + expected: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.model.GetVersion(tt.version) + if tt.expected == nil { + g.Expect(result).To(BeNil()) + } else { + g.Expect(result.Version).To(Equal(tt.expected.Version)) + } + }) + } +} + +func TestModel_GetLastAvailableModel(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + model *Model + expected *ModelVersion + }{ + { + name: "has available version", + model: &Model{ + Versions: []*ModelVersion{ + { + Version: 1, + State: &ModelStatus{State: ModelState_ModelAvailable}, + }, + { + Version: 2, + State: &ModelStatus{State: ModelState_ModelProgressing}, + }, + { + Version: 3, + State: &ModelStatus{State: ModelState_ModelAvailable}, + }, + }, + }, + expected: &ModelVersion{Version: 3}, + }, + { + name: "no available version", + model: &Model{ + Versions: []*ModelVersion{ + { + Version: 1, + State: &ModelStatus{State: ModelState_ModelProgressing}, + }, + }, + }, + expected: nil, + }, + { + name: "nil model", + model: nil, + expected: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.model.GetLastAvailableModel() + if tt.expected == nil { + g.Expect(result).To(BeNil()) + } else { + g.Expect(result.Version).To(Equal(tt.expected.Version)) + } + }) + } +} + +func TestModel_CanReceiveTraffic(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + model *Model + expected bool + }{ + { + name: "has available model", + model: &Model{ + Versions: []*ModelVersion{ + { + Version: 1, + State: &ModelStatus{State: ModelState_ModelAvailable}, + }, + }, + }, + expected: true, + }, + { + name: "latest has live replicas", + model: &Model{ + Versions: []*ModelVersion{ + { + Version: 1, + State: &ModelStatus{State: ModelState_ModelProgressing}, + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + }, + }, + }, + }, + expected: true, + }, + { + name: "no traffic-ready versions", + model: &Model{ + Versions: []*ModelVersion{ + { + Version: 1, + State: &ModelStatus{State: ModelState_ModelProgressing}, + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Unloaded}, + }, + }, + }, + }, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.model.CanReceiveTraffic() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModel_GetVersionsBeforeLastAvailable(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + model *Model + expected []*ModelVersion + }{ + { + name: "has versions before last available", + model: &Model{ + Versions: []*ModelVersion{ + {Version: 1, State: &ModelStatus{State: ModelState_ModelAvailable}}, + {Version: 2, State: &ModelStatus{State: ModelState_ModelProgressing}}, + {Version: 3, State: &ModelStatus{State: ModelState_ModelAvailable}}, + {Version: 4, State: &ModelStatus{State: ModelState_ModelProgressing}}, + }, + }, + expected: []*ModelVersion{ + {Version: 1}, + {Version: 2}, + }, + }, + { + name: "no available version", + model: &Model{ + Versions: []*ModelVersion{ + {Version: 1, State: &ModelStatus{State: ModelState_ModelProgressing}}, + }, + }, + expected: nil, + }, + { + name: "first version is available", + model: &Model{ + Versions: []*ModelVersion{ + {Version: 1, State: &ModelStatus{State: ModelState_ModelAvailable}}, + {Version: 2, State: &ModelStatus{State: ModelState_ModelProgressing}}, + }, + }, + expected: []*ModelVersion{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.model.GetVersionsBeforeLastAvailable() + if tt.expected == nil { + g.Expect(result).To(BeNil()) + } else { + g.Expect(result).To(HaveLen(len(tt.expected))) + } + }) + } +} + +func TestModel_GetVersionsBeforeLastModelGwAvailable(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + model *Model + expected []*ModelVersion + }{ + { + name: "has versions before last modelgw available", + model: &Model{ + Versions: []*ModelVersion{ + {Version: 1, State: &ModelStatus{ModelGwState: ModelState_ModelAvailable}}, + {Version: 2, State: &ModelStatus{ModelGwState: ModelState_ModelProgressing}}, + {Version: 3, State: &ModelStatus{ModelGwState: ModelState_ModelAvailable}}, + }, + }, + expected: []*ModelVersion{ + {Version: 1}, + {Version: 2}, + }, + }, + { + name: "no modelgw available version", + model: &Model{ + Versions: []*ModelVersion{ + {Version: 1, State: &ModelStatus{ModelGwState: ModelState_ModelProgressing}}, + }, + }, + expected: nil, + }, + { + name: "nil model", + model: nil, + expected: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.model.GetVersionsBeforeLastModelGwAvailable() + if tt.expected == nil { + g.Expect(result).To(BeNil()) + } else { + g.Expect(result).To(HaveLen(len(tt.expected))) + } + }) + } +} + +func TestModel_Inactive(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + model *Model + expected bool + }{ + { + name: "latest version inactive", + model: &Model{ + Versions: []*ModelVersion{ + { + Version: 1, + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Unloaded}, + }, + }, + }, + }, + expected: true, + }, + { + name: "latest version active", + model: &Model{ + Versions: []*ModelVersion{ + { + Version: 1, + Replicas: map[int32]*ReplicaStatus{ + 0: {State: ModelReplicaState_Loaded}, + }, + }, + }, + }, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.model.Inactive() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModel_GetLastAvailableModelVersion(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + model *Model + expected *ModelVersion + }{ + { + name: "has available version", + model: &Model{ + Versions: []*ModelVersion{ + {Version: 1, State: &ModelStatus{State: ModelState_ModelProgressing}}, + {Version: 2, State: &ModelStatus{State: ModelState_ModelAvailable}}, + {Version: 3, State: &ModelStatus{State: ModelState_ModelProgressing}}, + }, + }, + expected: &ModelVersion{Version: 2}, + }, + { + name: "no available version", + model: &Model{ + Versions: []*ModelVersion{ + {Version: 1, State: &ModelStatus{State: ModelState_ModelProgressing}}, + }, + }, + expected: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.model.GetLastAvailableModelVersion() + if tt.expected == nil { + g.Expect(result).To(BeNil()) + } else { + g.Expect(result.Version).To(Equal(tt.expected.Version)) + } + }) + } +} + +func TestModelReplicaState_CanReceiveTraffic(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + state ModelReplicaState + expected bool + }{ + {name: "Loaded", state: ModelReplicaState_Loaded, expected: true}, + {name: "Available", state: ModelReplicaState_Available, expected: true}, + {name: "LoadedUnavailable", state: ModelReplicaState_LoadedUnavailable, expected: true}, + {name: "Draining", state: ModelReplicaState_Draining, expected: true}, + {name: "Loading", state: ModelReplicaState_Loading, expected: false}, + {name: "Unloaded", state: ModelReplicaState_Unloaded, expected: false}, + {name: "Unloading", state: ModelReplicaState_Unloading, expected: false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.state.CanReceiveTraffic() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModelReplicaState_AlreadyLoadingOrLoaded(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + state ModelReplicaState + expected bool + }{ + {name: "Loading", state: ModelReplicaState_Loading, expected: true}, + {name: "Loaded", state: ModelReplicaState_Loaded, expected: true}, + {name: "Available", state: ModelReplicaState_Available, expected: true}, + {name: "LoadedUnavailable", state: ModelReplicaState_LoadedUnavailable, expected: true}, + {name: "LoadRequested", state: ModelReplicaState_LoadRequested, expected: false}, + {name: "Unloaded", state: ModelReplicaState_Unloaded, expected: false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.state.AlreadyLoadingOrLoaded() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModelReplicaState_UnloadingOrUnloaded(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + state ModelReplicaState + expected bool + }{ + {name: "UnloadEnvoyRequested", state: ModelReplicaState_UnloadEnvoyRequested, expected: true}, + {name: "UnloadRequested", state: ModelReplicaState_UnloadRequested, expected: true}, + {name: "Unloading", state: ModelReplicaState_Unloading, expected: true}, + {name: "Unloaded", state: ModelReplicaState_Unloaded, expected: true}, + {name: "ModelReplicaStateUnknown", state: ModelReplicaState_ModelReplicaStateUnknown, expected: true}, + {name: "Loaded", state: ModelReplicaState_Loaded, expected: false}, + {name: "Available", state: ModelReplicaState_Available, expected: false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.state.UnloadingOrUnloaded() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModelReplicaState_Inactive(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + state ModelReplicaState + expected bool + }{ + {name: "Unloaded", state: ModelReplicaState_Unloaded, expected: true}, + {name: "UnloadFailed", state: ModelReplicaState_UnloadFailed, expected: true}, + {name: "ModelReplicaStateUnknown", state: ModelReplicaState_ModelReplicaStateUnknown, expected: true}, + {name: "LoadFailed", state: ModelReplicaState_LoadFailed, expected: true}, + {name: "Loaded", state: ModelReplicaState_Loaded, expected: false}, + {name: "Available", state: ModelReplicaState_Available, expected: false}, + {name: "Loading", state: ModelReplicaState_Loading, expected: false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.state.Inactive() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestModelReplicaState_IsLoadingOrLoaded(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + state ModelReplicaState + expected bool + }{ + {name: "Loaded", state: ModelReplicaState_Loaded, expected: true}, + {name: "LoadRequested", state: ModelReplicaState_LoadRequested, expected: true}, + {name: "Loading", state: ModelReplicaState_Loading, expected: true}, + {name: "Available", state: ModelReplicaState_Available, expected: true}, + {name: "LoadedUnavailable", state: ModelReplicaState_LoadedUnavailable, expected: true}, + {name: "Unloaded", state: ModelReplicaState_Unloaded, expected: false}, + {name: "Unloading", state: ModelReplicaState_Unloading, expected: false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.state.IsLoadingOrLoaded() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} \ No newline at end of file diff --git a/apis/go/mlops/scheduler/db/server_ext_test.go b/apis/go/mlops/scheduler/db/server_ext_test.go new file mode 100644 index 0000000000..327228eff1 --- /dev/null +++ b/apis/go/mlops/scheduler/db/server_ext_test.go @@ -0,0 +1,578 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + +package db + +import ( + "testing" + + . "github.com/onsi/gomega" +) + +func TestServer_AddReplica(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + server *Server + replicaID int32 + replica *ServerReplica + expectedLen int + }{ + { + name: "add replica to empty server", + server: &Server{}, + replicaID: 0, + replica: &ServerReplica{ServerName: "test-server"}, + expectedLen: 1, + }, + { + name: "add replica to server with existing replicas", + server: &Server{ + Replicas: map[int32]*ServerReplica{ + 0: {ServerName: "server1"}, + }, + }, + replicaID: 1, + replica: &ServerReplica{ServerName: "server2"}, + expectedLen: 2, + }, + { + name: "overwrite existing replica", + server: &Server{ + Replicas: map[int32]*ServerReplica{ + 0: {ServerName: "old-server"}, + }, + }, + replicaID: 0, + replica: &ServerReplica{ServerName: "new-server"}, + expectedLen: 1, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.server.AddReplica(tt.replicaID, tt.replica) + g.Expect(tt.server.Replicas).To(HaveLen(tt.expectedLen)) + g.Expect(tt.server.Replicas[tt.replicaID]).To(Equal(tt.replica)) + }) + } +} + +func TestServerReplica_GetLoadedOrLoadingModelVersions(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + replica *ServerReplica + expectedLen int + expectedIDs []*ModelVersionID + }{ + { + name: "both loaded and loading models", + replica: &ServerReplica{ + LoadedModels: []*ModelVersionID{ + {Name: "model1", Version: 1}, + {Name: "model2", Version: 1}, + }, + LoadingModels: []*ModelVersionID{ + {Name: "model3", Version: 1}, + }, + }, + expectedLen: 3, + expectedIDs: []*ModelVersionID{ + {Name: "model1", Version: 1}, + {Name: "model2", Version: 1}, + {Name: "model3", Version: 1}, + }, + }, + { + name: "only loaded models", + replica: &ServerReplica{ + LoadedModels: []*ModelVersionID{ + {Name: "model1", Version: 1}, + }, + LoadingModels: []*ModelVersionID{}, + }, + expectedLen: 1, + expectedIDs: []*ModelVersionID{ + {Name: "model1", Version: 1}, + }, + }, + { + name: "only loading models", + replica: &ServerReplica{ + LoadedModels: []*ModelVersionID{}, + LoadingModels: []*ModelVersionID{ + {Name: "model1", Version: 1}, + }, + }, + expectedLen: 1, + expectedIDs: []*ModelVersionID{ + {Name: "model1", Version: 1}, + }, + }, + { + name: "no models", + replica: &ServerReplica{ + LoadedModels: []*ModelVersionID{}, + LoadingModels: []*ModelVersionID{}, + }, + expectedLen: 0, + expectedIDs: []*ModelVersionID{}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.replica.GetLoadedOrLoadingModelVersions() + g.Expect(result).To(HaveLen(tt.expectedLen)) + for i, expected := range tt.expectedIDs { + g.Expect(result[i].Name).To(Equal(expected.Name)) + g.Expect(result[i].Version).To(Equal(expected.Version)) + } + }) + } +} + +func TestServerReplica_UpdateReservedMemory(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + replica *ServerReplica + memBytes uint64 + isAdd bool + expectedReserved uint64 + }{ + { + name: "add memory", + replica: &ServerReplica{ + ReservedMemory: 1000, + }, + memBytes: 500, + isAdd: true, + expectedReserved: 1500, + }, + { + name: "subtract memory", + replica: &ServerReplica{ + ReservedMemory: 1000, + }, + memBytes: 300, + isAdd: false, + expectedReserved: 700, + }, + { + name: "subtract more than reserved - clamp to 0", + replica: &ServerReplica{ + ReservedMemory: 500, + }, + memBytes: 1000, + isAdd: false, + expectedReserved: 0, + }, + { + name: "add to zero", + replica: &ServerReplica{ + ReservedMemory: 0, + }, + memBytes: 500, + isAdd: true, + expectedReserved: 500, + }, + { + name: "subtract exact amount", + replica: &ServerReplica{ + ReservedMemory: 500, + }, + memBytes: 500, + isAdd: false, + expectedReserved: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.replica.UpdateReservedMemory(tt.memBytes, tt.isAdd) + g.Expect(tt.replica.ReservedMemory).To(Equal(tt.expectedReserved)) + }) + } +} + +func TestServerReplica_GetNumLoadedModels(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + replica *ServerReplica + expected int + }{ + { + name: "multiple unique models", + replica: &ServerReplica{ + UniqueLoadedModels: map[string]bool{ + "model1": true, + "model2": true, + "model3": true, + }, + }, + expected: 3, + }, + { + name: "single model", + replica: &ServerReplica{ + UniqueLoadedModels: map[string]bool{ + "model1": true, + }, + }, + expected: 1, + }, + { + name: "no models", + replica: &ServerReplica{ + UniqueLoadedModels: map[string]bool{}, + }, + expected: 0, + }, + { + name: "nil map", + replica: &ServerReplica{ + UniqueLoadedModels: nil, + }, + expected: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := tt.replica.GetNumLoadedModels() + g.Expect(result).To(Equal(tt.expected)) + }) + } +} + +func TestServerReplica_AddModelVersion(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + replica *ServerReplica + modelName string + modelVersion uint32 + replicaState ModelReplicaState + expectedLoadingLen int + expectedLoadedLen int + expectedUniqueModels map[string]bool + }{ + { + name: "add loading model to empty replica", + replica: &ServerReplica{}, + modelName: "model1", + modelVersion: 1, + replicaState: ModelReplicaState_Loading, + expectedLoadingLen: 1, + expectedLoadedLen: 0, + expectedUniqueModels: nil, + }, + { + name: "add loaded model moves from loading to loaded", + replica: &ServerReplica{ + LoadingModels: []*ModelVersionID{ + {Name: "model1", Version: 1}, + }, + }, + modelName: "model1", + modelVersion: 1, + replicaState: ModelReplicaState_Loaded, + expectedLoadingLen: 0, + expectedLoadedLen: 1, + expectedUniqueModels: map[string]bool{ + "model1": true, + }, + }, + { + name: "add loaded model to existing loaded models", + replica: &ServerReplica{ + LoadedModels: []*ModelVersionID{ + {Name: "model1", Version: 1}, + }, + UniqueLoadedModels: map[string]bool{ + "model1": true, + }, + }, + modelName: "model2", + modelVersion: 1, + replicaState: ModelReplicaState_Loaded, + expectedLoadingLen: 0, + expectedLoadedLen: 2, + expectedUniqueModels: map[string]bool{ + "model1": true, + "model2": true, + }, + }, + { + name: "add multiple versions of same model", + replica: &ServerReplica{ + LoadedModels: []*ModelVersionID{ + {Name: "model1", Version: 1}, + }, + UniqueLoadedModels: map[string]bool{ + "model1": true, + }, + }, + modelName: "model1", + modelVersion: 2, + replicaState: ModelReplicaState_Loaded, + expectedLoadingLen: 0, + expectedLoadedLen: 2, + expectedUniqueModels: map[string]bool{ + "model1": true, + }, + }, + { + name: "add duplicate loading model - should not duplicate", + replica: &ServerReplica{ + LoadingModels: []*ModelVersionID{ + {Name: "model1", Version: 1}, + }, + }, + modelName: "model1", + modelVersion: 1, + replicaState: ModelReplicaState_Loading, + expectedLoadingLen: 1, + expectedLoadedLen: 0, + expectedUniqueModels: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.replica.AddModelVersion(tt.modelName, tt.modelVersion, tt.replicaState) + g.Expect(tt.replica.LoadingModels).To(HaveLen(tt.expectedLoadingLen)) + g.Expect(tt.replica.LoadedModels).To(HaveLen(tt.expectedLoadedLen)) + if tt.expectedUniqueModels != nil { + g.Expect(tt.replica.UniqueLoadedModels).To(Equal(tt.expectedUniqueModels)) + } + }) + } +} + +func TestServerReplica_DeleteModelVersion(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + replica *ServerReplica + modelName string + modelVersion uint32 + expectedLoadingLen int + expectedLoadedLen int + expectedUniqueModels map[string]bool + }{ + { + name: "delete loading model", + replica: &ServerReplica{ + LoadingModels: []*ModelVersionID{ + {Name: "model1", Version: 1}, + {Name: "model2", Version: 1}, + }, + }, + modelName: "model1", + modelVersion: 1, + expectedLoadingLen: 1, + expectedLoadedLen: 0, + expectedUniqueModels: nil, + }, + { + name: "delete loaded model", + replica: &ServerReplica{ + LoadedModels: []*ModelVersionID{ + {Name: "model1", Version: 1}, + {Name: "model2", Version: 1}, + }, + UniqueLoadedModels: map[string]bool{ + "model1": true, + "model2": true, + }, + }, + modelName: "model1", + modelVersion: 1, + expectedLoadingLen: 0, + expectedLoadedLen: 1, + expectedUniqueModels: map[string]bool{ + "model2": true, + }, + }, + { + name: "delete last version of model removes from unique", + replica: &ServerReplica{ + LoadedModels: []*ModelVersionID{ + {Name: "model1", Version: 1}, + }, + UniqueLoadedModels: map[string]bool{ + "model1": true, + }, + }, + modelName: "model1", + modelVersion: 1, + expectedLoadingLen: 0, + expectedLoadedLen: 0, + expectedUniqueModels: map[string]bool{}, + }, + { + name: "delete one version keeps model in unique if other versions exist", + replica: &ServerReplica{ + LoadedModels: []*ModelVersionID{ + {Name: "model1", Version: 1}, + {Name: "model1", Version: 2}, + }, + UniqueLoadedModels: map[string]bool{ + "model1": true, + }, + }, + modelName: "model1", + modelVersion: 1, + expectedLoadingLen: 0, + expectedLoadedLen: 1, + expectedUniqueModels: map[string]bool{ + "model1": true, + }, + }, + { + name: "delete from both loading and loaded", + replica: &ServerReplica{ + LoadingModels: []*ModelVersionID{ + {Name: "model1", Version: 1}, + }, + LoadedModels: []*ModelVersionID{ + {Name: "model1", Version: 1}, + }, + UniqueLoadedModels: map[string]bool{ + "model1": true, + }, + }, + modelName: "model1", + modelVersion: 1, + expectedLoadingLen: 0, + expectedLoadedLen: 0, + expectedUniqueModels: map[string]bool{}, + }, + { + name: "delete non-existent model", + replica: &ServerReplica{ + LoadedModels: []*ModelVersionID{ + {Name: "model1", Version: 1}, + }, + UniqueLoadedModels: map[string]bool{ + "model1": true, + }, + }, + modelName: "model2", + modelVersion: 1, + expectedLoadingLen: 0, + expectedLoadedLen: 1, + expectedUniqueModels: map[string]bool{ + "model1": true, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.replica.DeleteModelVersion(tt.modelName, tt.modelVersion) + g.Expect(tt.replica.LoadingModels).To(HaveLen(tt.expectedLoadingLen)) + g.Expect(tt.replica.LoadedModels).To(HaveLen(tt.expectedLoadedLen)) + if tt.expectedUniqueModels != nil { + g.Expect(tt.replica.UniqueLoadedModels).To(Equal(tt.expectedUniqueModels)) + } + }) + } +} + +func TestServerReplica_AddModelVersion_Integration(t *testing.T) { + g := NewWithT(t) + + t.Run("full lifecycle: loading -> loaded -> delete", func(t *testing.T) { + replica := &ServerReplica{} + + // Add model as loading + replica.AddModelVersion("model1", 1, ModelReplicaState_Loading) + g.Expect(replica.LoadingModels).To(HaveLen(1)) + g.Expect(replica.LoadedModels).To(HaveLen(0)) + g.Expect(replica.GetNumLoadedModels()).To(Equal(0)) + + // Move to loaded + replica.AddModelVersion("model1", 1, ModelReplicaState_Loaded) + g.Expect(replica.LoadingModels).To(HaveLen(0)) + g.Expect(replica.LoadedModels).To(HaveLen(1)) + g.Expect(replica.GetNumLoadedModels()).To(Equal(1)) + + // Verify it's in loaded models list + allModels := replica.GetLoadedOrLoadingModelVersions() + g.Expect(allModels).To(HaveLen(1)) + g.Expect(allModels[0].Name).To(Equal("model1")) + g.Expect(allModels[0].Version).To(Equal(uint32(1))) + + // Delete model + replica.DeleteModelVersion("model1", 1) + g.Expect(replica.LoadedModels).To(HaveLen(0)) + g.Expect(replica.GetNumLoadedModels()).To(Equal(0)) + }) + + t.Run("multiple models", func(t *testing.T) { + replica := &ServerReplica{} + + // Add three models + replica.AddModelVersion("model1", 1, ModelReplicaState_Loaded) + replica.AddModelVersion("model2", 1, ModelReplicaState_Loaded) + replica.AddModelVersion("model3", 1, ModelReplicaState_Loading) + + g.Expect(replica.LoadedModels).To(HaveLen(2)) + g.Expect(replica.LoadingModels).To(HaveLen(1)) + g.Expect(replica.GetNumLoadedModels()).To(Equal(2)) + + allModels := replica.GetLoadedOrLoadingModelVersions() + g.Expect(allModels).To(HaveLen(3)) + + // Delete one loaded model + replica.DeleteModelVersion("model1", 1) + g.Expect(replica.GetNumLoadedModels()).To(Equal(1)) + g.Expect(replica.UniqueLoadedModels).To(HaveKey("model2")) + g.Expect(replica.UniqueLoadedModels).NotTo(HaveKey("model1")) + }) +} + +func TestServerReplica_UpdateReservedMemory_Integration(t *testing.T) { + g := NewWithT(t) + + t.Run("reserve and release memory", func(t *testing.T) { + replica := &ServerReplica{ + Memory: 10000, + ReservedMemory: 0, + AvailableMemory: 10000, + } + + // Reserve memory for model 1 + replica.UpdateReservedMemory(3000, true) + g.Expect(replica.ReservedMemory).To(Equal(uint64(3000))) + + // Reserve memory for model 2 + replica.UpdateReservedMemory(2000, true) + g.Expect(replica.ReservedMemory).To(Equal(uint64(5000))) + + // Release model 1 + replica.UpdateReservedMemory(3000, false) + g.Expect(replica.ReservedMemory).To(Equal(uint64(2000))) + + // Release model 2 + replica.UpdateReservedMemory(2000, false) + g.Expect(replica.ReservedMemory).To(Equal(uint64(0))) + }) +} \ No newline at end of file diff --git a/hodometer/go.mod b/hodometer/go.mod index a205724a2d..2073f18bc5 100644 --- a/hodometer/go.mod +++ b/hodometer/go.mod @@ -31,15 +31,12 @@ require ( github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/swag v0.23.1 // indirect github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/protobuf v1.5.4 // indirect github.com/google/btree v1.1.3 // indirect github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect - github.com/google/gofuzz v1.2.0 // indirect github.com/hashicorp/go-cleanhttp v0.5.2 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/klauspost/compress v1.18.0 // indirect github.com/mailru/easyjson v0.9.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect @@ -54,16 +51,16 @@ require ( github.com/x448/float16 v0.8.4 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/net v0.41.0 // indirect + golang.org/x/net v0.43.0 // indirect golang.org/x/oauth2 v0.30.0 // indirect - golang.org/x/sync v0.15.0 // indirect - golang.org/x/sys v0.33.0 // indirect - golang.org/x/term v0.32.0 // indirect - golang.org/x/text v0.26.0 // indirect + golang.org/x/sync v0.16.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/term v0.34.0 // indirect + golang.org/x/text v0.28.0 // indirect golang.org/x/time v0.12.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250603155806-513f23925822 // indirect - google.golang.org/protobuf v1.36.6 // indirect + google.golang.org/protobuf v1.36.7 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/hodometer/go.sum b/hodometer/go.sum index 4b21d83e56..056c35deda 100644 --- a/hodometer/go.sum +++ b/hodometer/go.sum @@ -3,6 +3,8 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03 github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= +github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= @@ -14,8 +16,6 @@ github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/dukex/mixpanel v1.0.1 h1:IQ3qBjtgltF044jU9+i6MubdDdpc8PKpK9yvfawRgeE= github.com/dukex/mixpanel v1.0.1/go.mod h1:080BDsRRMzAxViWT3OjlQaMW9nhaIEXDHHtGeDK60b8= -github.com/emicklei/go-restful/v3 v3.12.1 h1:PJMDIM/ak7btuL8Ex0iYET9hxM3CI2sjZtzpL63nKAU= -github.com/emicklei/go-restful/v3 v3.12.1/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= @@ -28,32 +28,22 @@ github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjT github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= -github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M= -github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= -github.com/fxamacker/cbor/v2 v2.7.0 h1:iM5WgngdRBanHcxugY4JySA0nk1wZorNOpTgCMedv5E= -github.com/fxamacker/cbor/v2 v2.7.0/go.mod h1:pxXPTn3joSm21Gbwsv0w9OSA2y1HFR9qXEeXQVeNoDQ= github.com/fxamacker/cbor/v2 v2.8.0 h1:fFtUGXUzXPHTIUdne5+zzMPTfffl3RD5qYnkY40vtxU= github.com/fxamacker/cbor/v2 v2.8.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= github.com/go-kit/log v0.1.0/go.mod h1:zbhenjAZHb184qTLMA9ZjW7ThYL0H2mk7Q6pNt4vbaY= github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= -github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= -github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= -github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= -github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= github.com/go-openapi/jsonpointer v0.21.1 h1:whnzv/pNXtK2FbX/W9yJfRmE2gsmkfahjMKB0fZvcic= github.com/go-openapi/jsonpointer v0.21.1/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk= github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= -github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= -github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZU= github.com/go-openapi/swag v0.23.1/go.mod h1:STZs8TbRvEQQKUA+JZNAm3EWlgaOBGpyFDqQnDHMef0= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= @@ -70,14 +60,10 @@ github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= -github.com/google/gnostic-models v0.6.9 h1:MU/8wDLif2qCXZmzncUQ/BOfxWfthHi63KqpoNbWqVw= -github.com/google/gnostic-models v0.6.9/go.mod h1:CiWsm0s6BSQd1hRn8/QmxqB6BesYcbSZxsz9b0KuDBw= github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= -github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= -github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= @@ -93,8 +79,6 @@ github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9n github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB11/k= github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M= -github.com/hashicorp/go-retryablehttp v0.7.7 h1:C8hUCYzor8PIfXHa4UrZkU4VvK8o9ISHxT2Q8+VepXU= -github.com/hashicorp/go-retryablehttp v0.7.7/go.mod h1:pkQpWZeYWskR+D1tR2O5OcBFOxfA7DoAO6xtkuQnHTk= github.com/hashicorp/go-retryablehttp v0.7.8 h1:ylXZWnqa7Lhqpk0L1P1LzDtGcCR0rPVUrx/c8Unxc48= github.com/hashicorp/go-retryablehttp v0.7.8/go.mod h1:rjiScheydd+CxvumBsIrFKlx3iS0jrZ7LvzFGFmuKbw= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -103,8 +87,7 @@ github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnr github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/compress v1.17.11 h1:In6xLpyWOi1+C7tXUUWv2ot1QvBjxevKAaI6IXrJmUc= -github.com/klauspost/compress v1.17.11/go.mod h1:pMDklpSncoRMuLFrf1W9Ss9KT+0rH90U12bZKk7uwG0= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= @@ -129,10 +112,10 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= -github.com/onsi/ginkgo/v2 v2.21.0 h1:7rg/4f3rB88pb5obDgNZrNHrQ4e6WpjonchcpuBRnZM= -github.com/onsi/ginkgo/v2 v2.21.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= -github.com/onsi/gomega v1.36.2 h1:koNYke6TVk6ZmnyHrCXba/T/MoLBXFjeC1PtvYgw0A8= -github.com/onsi/gomega v1.36.2/go.mod h1:DdwyADRjrc825LhMEkD76cHR5+pUnjhUN8GlHlRPHzY= +github.com/onsi/ginkgo/v2 v2.22.0 h1:Yed107/8DjTr0lKCNt7Dn8yQ6ybuDRQoMGrNFKzMfHg= +github.com/onsi/ginkgo/v2 v2.22.0/go.mod h1:7Du3c42kxCUegi0IImZ1wUQzMBVecgIHjR1C+NkhLQo= +github.com/onsi/gomega v1.39.0 h1:y2ROC3hKFmQZJNFeGAMeHZKkjBL65mIZcvrLQBF9k6Q= +github.com/onsi/gomega v1.39.0/go.mod h1:ZCU1pkQcXDO5Sl9/VVEGlDyp+zm0m1cmeG5TOzLgdh4= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= github.com/otiai10/copy v1.14.1 h1:5/7E6qsUMBaH5AnQ0sSLzzTg1oTECmcCmT6lvF45Na8= github.com/otiai10/copy v1.14.1/go.mod h1:oQwrEDDOci3IM8dJF0d8+jnbfPDllW6vUjNc3DoZm9I= @@ -144,27 +127,17 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/prometheus/client_golang v1.20.5 h1:cxppBPuYhUnsO6yo/aoRol4L7q7UFfdm+bR9r+8l63Y= -github.com/prometheus/client_golang v1.20.5/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= -github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.62.0 h1:xasJaQlnWAeyHdUBeGjXmutelfJHWMRr+Fg4QszZ2Io= -github.com/prometheus/common v0.62.0/go.mod h1:vyBcEuLSvWos9B1+CyL7JZ2up+uFzXhkqml0W5zIY1I= github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE= github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8= -github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= -github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= -github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= -github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= -github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= +github.com/rogpeppe/go-internal v1.13.1 h1:KvO1DLK/DRN07sQ1LQKScxyZJuNnedQ5/wKSR38lUII= +github.com/rogpeppe/go-internal v1.13.1/go.mod h1:uMEvuHeurkdAXX61udpOXGD/AzZDWNMNyH2VO9fmH0o= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= @@ -172,6 +145,8 @@ github.com/spf13/pflag v1.0.6 h1:jFzHGLGAlb3ruxLB8MhbI6A8+AQX/2eW4qeyNZXNp2o= github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= @@ -182,16 +157,18 @@ github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= -go.opentelemetry.io/otel v1.32.0 h1:WnBN+Xjcteh0zdk01SVqV55d/m62NJLJdIyb4y/WO5U= -go.opentelemetry.io/otel v1.32.0/go.mod h1:00DCVSB0RQcnzlwyTfqtxSm+DRr9hpYrHjNGiBHVQIg= -go.opentelemetry.io/otel/metric v1.32.0 h1:xV2umtmNcThh2/a/aCP+h64Xx5wsj8qqnkYZktzNa0M= -go.opentelemetry.io/otel/metric v1.32.0/go.mod h1:jH7CIbbK6SH2V2wE16W05BHCtIDzauciCRLoc/SyMv8= -go.opentelemetry.io/otel/sdk v1.32.0 h1:RNxepc9vK59A8XsgZQouW8ue8Gkb4jpWtJm9ge5lEG4= -go.opentelemetry.io/otel/sdk v1.32.0/go.mod h1:LqgegDBjKMmb2GC6/PrTnteJG39I8/vJCAP9LlJXEjU= -go.opentelemetry.io/otel/sdk/metric v1.32.0 h1:rZvFnvmvawYb0alrYkjraqJq0Z4ZUJAiyYCU9snn1CU= -go.opentelemetry.io/otel/sdk/metric v1.32.0/go.mod h1:PWeZlq0zt9YkYAp3gjKZ0eicRYvOh1Gd+X99x6GHpCQ= -go.opentelemetry.io/otel/trace v1.32.0 h1:WIC9mYrXf8TmY/EXuULKc8hR17vE+Hjv2cssQDe03fM= -go.opentelemetry.io/otel/trace v1.32.0/go.mod h1:+i4rkvCraA+tG6AzwloGaCtkx53Fa+L+V8e9a7YvhT8= +go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= +go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= +go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= +go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= +go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= +go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= +go.opentelemetry.io/otel/sdk v1.35.0 h1:iPctf8iprVySXSKJffSS79eOjl9pvxV9ZqOWT0QejKY= +go.opentelemetry.io/otel/sdk v1.35.0/go.mod h1:+ga1bZliga3DxJ3CQGg3updiaAJoNECOgJREo9KHGQg= +go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5JpUCaEqEI9o= +go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w= +go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= +go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= @@ -204,8 +181,6 @@ go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= go.yaml.in/yaml/v2 v2.4.2 h1:DzmwEr2rDGHl7lsFgAHxmNz/1NlQ7xLIrlN2h5d1eGI= go.yaml.in/yaml/v2 v2.4.2/go.mod h1:081UH+NErpNdqlCXm3TtEran0rJZGxAYx9hb/ELlsPU= -go.yaml.in/yaml/v3 v3.0.3 h1:bXOww4E/J3f66rav3pX3m8w6jDE4knZjGOw8b5Y6iNE= -go.yaml.in/yaml/v3 v3.0.3/go.mod h1:tBHosrYAkRZjRAOREWbDnBXUf08JOwYq++0QNwQiWzI= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -226,13 +201,9 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.34.0 h1:Mb7Mrk043xzHgnRM88suvJFwzVrRfHEHJEl5/71CKw0= -golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= -golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= -golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= -golang.org/x/oauth2 v0.25.0 h1:CY4y7XT9v0cRI9oupztF8AgiIu99L/ksR/Xp/6jrZ70= -golang.org/x/oauth2 v0.25.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -240,10 +211,8 @@ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= -golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -251,22 +220,14 @@ golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.29.0 h1:TPYlXGxvx1MGTn2GiZDhnjPA9wZzZeGKHHmKhHYvgaU= -golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.28.0 h1:/Ts8HFuMR2E6IP/jlo7QVLZHggjKQbhu/7H0LJFr3Gg= -golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek= -golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= -golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.34.0 h1:O/2T7POpk0ZZ7MAzMeWFSg6S5IpWd/RXDlM9hgM3DR4= +golang.org/x/term v0.34.0/go.mod h1:5jC53AEywhIVebHgPVeg0mj8OD3VO9OzclacVrqpaAw= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= -golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= -golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= -golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= -golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= -golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -278,14 +239,12 @@ golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.26.0 h1:v/60pFQmzmT9ExmjDv2gGIfi3OqfKoEP6I5+umXlbnQ= -golang.org/x/tools v0.26.0/go.mod h1:TPVVj70c7JJ3WCazhD8OdXcZg/og+b9+tH/KxylGwH0= +golang.org/x/tools v0.35.0 h1:mBffYraMEf7aa0sB+NuKnuCy8qI/9Bughn8dC2Gu5r0= +golang.org/x/tools v0.35.0/go.mod h1:NKdj5HkL/73byiZSJjqJgKn3ep7KjFkBOkR/Hps3VPw= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= -gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0= gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= @@ -293,8 +252,6 @@ google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7 google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20200423170343-7949de9c1215/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250127172529-29210b9bc287 h1:J1H9f+LEdWAfHcez/4cvaVBox7cOYT+IU6rgqj5x++8= -google.golang.org/genproto/googleapis/rpc v0.0.0-20250127172529-29210b9bc287/go.mod h1:8BS3B93F/U1juMFq9+EDk+qOT5CO1R9IzXxG3PTqiRk= google.golang.org/genproto/googleapis/rpc v0.0.0-20250603155806-513f23925822 h1:fc6jSaCT0vBduLYZHYrBBNY4dsWuvgyff9noRNDdBeE= google.golang.org/genproto/googleapis/rpc v0.0.0-20250603155806-513f23925822/go.mod h1:qQ0YXyHHx3XkvlzUtpXDkS29lDSafHMZBAZDc03LQ3A= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= @@ -302,14 +259,10 @@ google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyac google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= -google.golang.org/grpc v1.70.0 h1:pWFv03aZoHzlRKHWicjsZytKAiYCtNS0dHbXnIdq7jQ= -google.golang.org/grpc v1.70.0/go.mod h1:ofIJqVKDXx/JiXrwr2IG4/zwdH9txy3IlF40RmcJSQw= google.golang.org/grpc v1.73.0 h1:VIWSmpI2MegBtTuFt5/JWy2oXxtjJ/e89Z70ImfD2ok= google.golang.org/grpc v1.73.0/go.mod h1:50sbHOUqWoCQGI8V2HQLJM0B+LMlIUjNSZmow7EVBQc= -google.golang.org/protobuf v1.36.4 h1:6A3ZDJHn/eNqc1i+IdefRzy/9PokBTPvcqMySR7NNIM= -google.golang.org/protobuf v1.36.4/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A= +google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= @@ -326,40 +279,22 @@ gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= -istio.io/pkg v0.0.0-20241216214326-d70796207df3 h1:RVGgJ/Bipm3ekfiiFzkaorjjNzJH8lpVU9SlGkRDg6w= -istio.io/pkg v0.0.0-20241216214326-d70796207df3/go.mod h1:vBLH0h+N4sU4Du8yjhaMPqbzyM0i7Q2lhm/v89qeows= istio.io/pkg v0.0.0-20250424180003-5bd6dc4b200f h1:3ihT3jF1MqjAl2MM0GezdgHryb7+uedN+s7Yv+J6tPI= istio.io/pkg v0.0.0-20250424180003-5bd6dc4b200f/go.mod h1:qLQ76l6863ztMotYCZqf+i2qNzEP9UX4xpA2dCvsb9I= -k8s.io/api v0.32.1 h1:f562zw9cy+GvXzXf0CKlVQ7yHJVYzLfL6JAS4kOAaOc= -k8s.io/api v0.32.1/go.mod h1:/Yi/BqkuueW1BgpoePYBRdDYfjPF5sgTr5+YqDZra5k= k8s.io/api v0.33.2 h1:YgwIS5jKfA+BZg//OQhkJNIfie/kmRsO0BmNaVSimvY= k8s.io/api v0.33.2/go.mod h1:fhrbphQJSM2cXzCWgqU29xLDuks4mu7ti9vveEnpSXs= -k8s.io/apiextensions-apiserver v0.32.1 h1:hjkALhRUeCariC8DiVmb5jj0VjIc1N0DREP32+6UXZw= -k8s.io/apiextensions-apiserver v0.32.1/go.mod h1:sxWIGuGiYov7Io1fAS2X06NjMIk5CbRHc2StSmbaQto= k8s.io/apiextensions-apiserver v0.33.2 h1:6gnkIbngnaUflR3XwE1mCefN3YS8yTD631JXQhsU6M8= k8s.io/apiextensions-apiserver v0.33.2/go.mod h1:IvVanieYsEHJImTKXGP6XCOjTwv2LUMos0YWc9O+QP8= -k8s.io/apimachinery v0.32.1 h1:683ENpaCBjma4CYqsmZyhEzrGz6cjn1MY/X2jB2hkZs= -k8s.io/apimachinery v0.32.1/go.mod h1:GpHVgxoKlTxClKcteaeuF1Ul/lDVb74KpZcxcmLDElE= k8s.io/apimachinery v0.33.2 h1:IHFVhqg59mb8PJWTLi8m1mAoepkUNYmptHsV+Z1m5jY= k8s.io/apimachinery v0.33.2/go.mod h1:BHW0YOu7n22fFv/JkYOEfkUYNRN0fj0BlvMFWA7b+SM= -k8s.io/client-go v0.32.1 h1:otM0AxdhdBIaQh7l1Q0jQpmo7WOFIk5FFa4bg6YMdUU= -k8s.io/client-go v0.32.1/go.mod h1:aTTKZY7MdxUaJ/KiUs8D+GssR9zJZi77ZqtzcGXIiDg= k8s.io/client-go v0.33.2 h1:z8CIcc0P581x/J1ZYf4CNzRKxRvQAwoAolYPbtQes+E= k8s.io/client-go v0.33.2/go.mod h1:9mCgT4wROvL948w6f6ArJNb7yQd7QsvqavDeZHvNmHo= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7 h1:hcha5B1kVACrLujCKLbr8XWMxCxzQx42DY8QKYJrDLg= -k8s.io/kube-openapi v0.0.0-20241212222426-2c72e554b1e7/go.mod h1:GewRfANuJ70iYzvn+i4lezLDAFzvjxZYK1gn1lWcfas= -k8s.io/kube-openapi v0.0.0-20250626183228-af0a60a813f8 h1:BbZLZQv9i6ROWiPa5sX3LGf3Sc0Ir7dtHwG2RXGmvRs= -k8s.io/kube-openapi v0.0.0-20250626183228-af0a60a813f8/go.mod h1:4ZFs+FSGR/oEzT6/XaRojsx96IHMbqK7wOGt+fcEuRk= k8s.io/kube-openapi v0.0.0-20250701173324-9bd5c66d9911 h1:gAXU86Fmbr/ktY17lkHwSjw5aoThQvhnstGGIYKlKYc= k8s.io/kube-openapi v0.0.0-20250701173324-9bd5c66d9911/go.mod h1:GLOk5B+hDbRROvt0X2+hqX64v/zO3vXN7J78OUmBSKw= -k8s.io/utils v0.0.0-20241210054802-24370beab758 h1:sdbE21q2nlQtFh65saZY+rRM6x6aJJI8IUa1AmH/qa0= -k8s.io/utils v0.0.0-20241210054802-24370beab758/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 h1:hwvWFiBzdWw1FhfY1FooPn3kzWuJ8tmbZBHi4zVsl1Y= k8s.io/utils v0.0.0-20250604170112-4c0f3b243397/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= -sigs.k8s.io/controller-runtime v0.20.1 h1:JbGMAG/X94NeM3xvjenVUaBjy6Ui4Ogd/J5ZtjZnHaE= -sigs.k8s.io/controller-runtime v0.20.1/go.mod h1:BrP3w158MwvB3ZbNpaAcIKkHQ7YGpYnzpoSTZ8E14WU= sigs.k8s.io/controller-runtime v0.21.0 h1:CYfjpEuicjUecRk+KAeyYh+ouUBn4llGyDYytIGcJS8= sigs.k8s.io/controller-runtime v0.21.0/go.mod h1:OSg14+F65eWqIu4DceX7k/+QRAbTTvxeQSNSOQpukWM= sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 h1:gBQPwqORJ8d8/YNZWEjoZs7npUVDpVXUUOFfW6CgAqE= @@ -367,11 +302,8 @@ sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8/go.mod h1:mdzfpAEoE6DHQEN0uh sigs.k8s.io/randfill v0.0.0-20250304075658-069ef1bbf016/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= -sigs.k8s.io/structured-merge-diff/v4 v4.5.0 h1:nbCitCK2hfnhyiKo6uf2HxUPTCodY6Qaf85SbDIaMBk= -sigs.k8s.io/structured-merge-diff/v4 v4.5.0/go.mod h1:N8f93tFZh9U6vpxwRArLiikrE5/2tiu1w1AGfACIGE4= sigs.k8s.io/structured-merge-diff/v4 v4.7.0 h1:qPeWmscJcXP0snki5IYF79Z8xrl8ETFxgMd7wez1XkI= sigs.k8s.io/structured-merge-diff/v4 v4.7.0/go.mod h1:dDy58f92j70zLsuZVuUX5Wp9vtxXpaZnkPGWeqDfCps= -sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= sigs.k8s.io/yaml v1.5.0 h1:M10b2U7aEUY6hRtU870n2VTPgR5RZiL/I6Lcc2F4NUQ= sigs.k8s.io/yaml v1.5.0/go.mod h1:wZs27Rbxoai4C0f8/9urLZtZtF3avA3gKvGyPdDqTO4= diff --git a/operator/go.mod b/operator/go.mod index c5a6630f5a..7bae795727 100644 --- a/operator/go.mod +++ b/operator/go.mod @@ -15,7 +15,7 @@ require ( github.com/grpc-ecosystem/go-grpc-middleware v1.4.0 github.com/json-iterator/go v1.1.12 github.com/onsi/ginkgo v1.16.5 - github.com/onsi/gomega v1.36.2 + github.com/onsi/gomega v1.39.0 github.com/seldonio/seldon-core/apis/go/v2 v2.9.1 github.com/seldonio/seldon-core/components/kafka/v2 v2.9.1 github.com/seldonio/seldon-core/components/tls/v2 v2.9.1 @@ -25,7 +25,7 @@ require ( go.uber.org/mock v0.4.0 golang.org/x/exp v0.0.0-20250128182459-e0ece0dbea4c google.golang.org/grpc v1.73.0 - google.golang.org/protobuf v1.36.6 + google.golang.org/protobuf v1.36.7 k8s.io/api v0.33.2 k8s.io/apimachinery v0.33.2 k8s.io/client-go v0.33.2 @@ -45,9 +45,9 @@ require ( github.com/x448/float16 v0.8.4 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/mod v0.25.0 // indirect - golang.org/x/sync v0.15.0 // indirect - golang.org/x/tools v0.33.0 // indirect + golang.org/x/mod v0.27.0 // indirect + golang.org/x/sync v0.16.0 // indirect + golang.org/x/tools v0.36.0 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect k8s.io/code-generator v0.33.2 // indirect k8s.io/gengo/v2 v2.0.0-20250207200755-1244d31929d7 // indirect @@ -86,11 +86,11 @@ require ( github.com/sirupsen/logrus v1.9.3 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 - golang.org/x/net v0.41.0 // indirect + golang.org/x/net v0.43.0 // indirect golang.org/x/oauth2 v0.30.0 // indirect - golang.org/x/sys v0.33.0 // indirect - golang.org/x/term v0.32.0 // indirect - golang.org/x/text v0.26.0 // indirect + golang.org/x/sys v0.35.0 // indirect + golang.org/x/term v0.34.0 // indirect + golang.org/x/text v0.28.0 // indirect golang.org/x/time v0.12.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250603155806-513f23925822 // indirect diff --git a/operator/go.sum b/operator/go.sum index 821c34f319..de1b564e5e 100644 --- a/operator/go.sum +++ b/operator/go.sum @@ -11,8 +11,8 @@ github.com/AlecAivazis/survey/v2 v2.3.7/go.mod h1:xUTIdE4KCOIjsBAE1JYsUPoCqYdZ1r github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/Masterminds/semver/v3 v3.2.1 h1:RN9w6+7QoMeJVGyfmbcgs28Br8cvmnucEXnY0rYXWg0= -github.com/Masterminds/semver/v3 v3.2.1/go.mod h1:qvl/7zhW3nngYb5+80sSMF+FG2BjYrf8m9wsX0PNOMQ= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= github.com/Microsoft/hcsshim v0.11.5 h1:haEcLNpj9Ka1gd3B3tAEs9CpE0c+1IhoL59w/exYU38= @@ -213,8 +213,8 @@ github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/ github.com/google/gofuzz v1.1.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad h1:a6HEuzUHeKH6hwfN/ZoQgRgVIWFJljSWa/zetS2WTvg= -github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4= github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ= github.com/google/uuid v1.1.1/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= @@ -339,14 +339,14 @@ github.com/onsi/ginkgo v1.11.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+ github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= -github.com/onsi/ginkgo/v2 v2.22.1 h1:QW7tbJAUDyVDVOM5dFa7qaybo+CRfR7bemlQUN6Z8aM= -github.com/onsi/ginkgo/v2 v2.22.1/go.mod h1:S6aTpoRsSq2cZOd+pssHAlKW/Q/jZt6cPrPlnj4a1xM= +github.com/onsi/ginkgo/v2 v2.25.3 h1:Ty8+Yi/ayDAGtk4XxmmfUy4GabvM+MegeB4cDLRi6nw= +github.com/onsi/ginkgo/v2 v2.25.3/go.mod h1:43uiyQC4Ed2tkOzLsEYm7hnrb7UJTWHYNsuy3bG/snE= github.com/onsi/gomega v0.0.0-20170829124025-dcabb60a477c/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA= github.com/onsi/gomega v1.7.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= -github.com/onsi/gomega v1.36.2 h1:koNYke6TVk6ZmnyHrCXba/T/MoLBXFjeC1PtvYgw0A8= -github.com/onsi/gomega v1.36.2/go.mod h1:DdwyADRjrc825LhMEkD76cHR5+pUnjhUN8GlHlRPHzY= +github.com/onsi/gomega v1.39.0 h1:y2ROC3hKFmQZJNFeGAMeHZKkjBL65mIZcvrLQBF9k6Q= +github.com/onsi/gomega v1.39.0/go.mod h1:ZCU1pkQcXDO5Sl9/VVEGlDyp+zm0m1cmeG5TOzLgdh4= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= @@ -485,6 +485,8 @@ go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J go.opentelemetry.io/proto/otlp v1.4.0 h1:TA9WRvW6zMwP+Ssb6fLoUIuirti1gGbP28GcKG1jgeg= go.opentelemetry.io/proto/otlp v1.4.0/go.mod h1:PPBWZIP98o2ElSqI35IHfu7hIhSwvc5N38Jw8pXuGFY= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= @@ -503,8 +505,8 @@ go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= -golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20250128182459-e0ece0dbea4c h1:KL/ZBHXgKGVmuZBZ01Lt57yE5ws8ZPSkkihmEyq7FXc= golang.org/x/exp v0.0.0-20250128182459-e0ece0dbea4c/go.mod h1:tujkw807nyEEAamNbDrEGzRav+ilXA7PCRAd6xsmwiU= @@ -514,8 +516,8 @@ golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHl golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.25.0 h1:n7a+ZbQKQA/Ysbyb0/6IbB1H/X41mKgbhfv7AfG/44w= -golang.org/x/mod v0.25.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= +golang.org/x/mod v0.27.0 h1:kb+q2PyFnEADO2IEF935ehFUXlWiNjJWtRNgBLSfbxQ= +golang.org/x/mod v0.27.0/go.mod h1:rWI627Fq0DEoudcK+MBkNkCe0EetEaDSwJJkCcjpazc= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -528,8 +530,8 @@ golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/ golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= -golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= @@ -538,8 +540,8 @@ golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= -golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.16.0 h1:ycBJEhp9p4vXvUZNszeOq0kGTPghopOL8q0fq3vstxw= +golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -555,14 +557,14 @@ golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220908164124-27713097b956/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= -golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= -golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= -golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= +golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= +golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/term v0.34.0 h1:O/2T7POpk0ZZ7MAzMeWFSg6S5IpWd/RXDlM9hgM3DR4= +golang.org/x/term v0.34.0/go.mod h1:5jC53AEywhIVebHgPVeg0mj8OD3VO9OzclacVrqpaAw= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= -golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -577,8 +579,8 @@ golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.33.0 h1:4qz2S3zmRxbGIhDIAgjxvFutSvH5EfnsYrRBj0UI0bc= -golang.org/x/tools v0.33.0/go.mod h1:CIJMaWEY88juyUfo7UbgPqbC8rU2OqfAV1h2Qp0oMYI= +golang.org/x/tools v0.36.0 h1:kWS0uv/zsvHEle1LbV5LE8QujrxB3wfQyxHfhOk0Qkg= +golang.org/x/tools v0.36.0/go.mod h1:WBDiHKJK8YgLHlcQPYQzNCkUxUypCaa5ZegCVutKm+s= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -613,8 +615,8 @@ google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2 google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A= +google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/cenkalti/backoff.v1 v1.1.0 h1:Arh75ttbsvlpVA7WtVpH4u9h6Zl46xuptxqLxPiSo4Y= gopkg.in/cenkalti/backoff.v1 v1.1.0/go.mod h1:J6Vskwqd+OMVJl8C33mmtxTBs2gyzfv7UDAkHu8BrjI= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/scheduler/cmd/scheduler/main.go b/scheduler/cmd/scheduler/main.go index b1db2df665..359fa56400 100644 --- a/scheduler/cmd/scheduler/main.go +++ b/scheduler/cmd/scheduler/main.go @@ -23,13 +23,13 @@ import ( "syscall" "time" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/backoff" "google.golang.org/grpc/credentials/insecure" "github.com/seldonio/seldon-core/apis/go/v2/mlops/health" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" kafka_config "github.com/seldonio/seldon-core/components/kafka/v2/pkg/config" "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" diff --git a/scheduler/go.mod b/scheduler/go.mod index 2b89ecdf40..5210c5f0fb 100644 --- a/scheduler/go.mod +++ b/scheduler/go.mod @@ -24,7 +24,7 @@ require ( github.com/knadh/koanf/v2 v2.3.0 github.com/mitchellh/copystructure v1.2.0 github.com/mustafaturan/bus/v3 v3.0.3 - github.com/onsi/gomega v1.36.2 + github.com/onsi/gomega v1.39.0 github.com/orcaman/concurrent-map v1.0.0 github.com/otiai10/copy v1.14.1 github.com/prometheus/client_golang v1.22.0 @@ -46,11 +46,12 @@ require ( go.opentelemetry.io/otel/trace v1.37.0 go.uber.org/mock v0.4.0 google.golang.org/grpc v1.73.0 - google.golang.org/protobuf v1.36.6 + google.golang.org/protobuf v1.36.7 gopkg.in/yaml.v2 v2.4.0 k8s.io/api v0.33.2 k8s.io/apimachinery v0.33.2 k8s.io/client-go v0.33.2 + k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 knative.dev/pkg v0.0.0-20250702180455-68cdb02d48c8 sigs.k8s.io/controller-runtime v0.21.0 ) @@ -120,16 +121,16 @@ require ( go.opentelemetry.io/proto/otlp v1.7.0 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/crypto v0.39.0 // indirect - golang.org/x/mod v0.25.0 // indirect - golang.org/x/net v0.41.0 // indirect + golang.org/x/crypto v0.41.0 // indirect + golang.org/x/mod v0.27.0 // indirect + golang.org/x/net v0.43.0 // indirect golang.org/x/oauth2 v0.30.0 // indirect golang.org/x/sync v0.16.0 // indirect golang.org/x/sys v0.36.0 // indirect - golang.org/x/term v0.32.0 // indirect - golang.org/x/text v0.26.0 // indirect + golang.org/x/term v0.34.0 // indirect + golang.org/x/text v0.28.0 // indirect golang.org/x/time v0.12.0 // indirect - golang.org/x/tools v0.34.0 // indirect + golang.org/x/tools v0.36.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect google.golang.org/genproto v0.0.0-20240325203815-454cdb8f5daa // indirect google.golang.org/genproto/googleapis/api v0.0.0-20250603155806-513f23925822 // indirect @@ -141,7 +142,6 @@ require ( k8s.io/apiextensions-apiserver v0.33.2 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250701173324-9bd5c66d9911 // indirect - k8s.io/utils v0.0.0-20250604170112-4c0f3b243397 // indirect sigs.k8s.io/json v0.0.0-20241014173422-cfa47c3a1cc8 // indirect sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/structured-merge-diff/v4 v4.7.0 // indirect diff --git a/scheduler/go.sum b/scheduler/go.sum index 3d39d46154..10aa9803f9 100644 --- a/scheduler/go.sum +++ b/scheduler/go.sum @@ -30,8 +30,8 @@ github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg6 github.com/AzureAD/microsoft-authentication-library-for-go v1.2.2 h1:XHOnouVk1mxXfQidrMEnLlPk9UMeRtyBTnEFtxkV0kU= github.com/AzureAD/microsoft-authentication-library-for-go v1.2.2/go.mod h1:wP83P5OoQ5p6ip3ScPr0BAq0BvuPAvacpEuSzyouqAI= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= -github.com/Masterminds/semver/v3 v3.2.1 h1:RN9w6+7QoMeJVGyfmbcgs28Br8cvmnucEXnY0rYXWg0= -github.com/Masterminds/semver/v3 v3.2.1/go.mod h1:qvl/7zhW3nngYb5+80sSMF+FG2BjYrf8m9wsX0PNOMQ= +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= github.com/Microsoft/hcsshim v0.11.5 h1:haEcLNpj9Ka1gd3B3tAEs9CpE0c+1IhoL59w/exYU38= @@ -281,8 +281,8 @@ github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad h1:a6HEuzUHeKH6hwfN/ZoQgRgVIWFJljSWa/zetS2WTvg= -github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0= github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM= github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4= @@ -433,10 +433,10 @@ github.com/mustafaturan/bus/v3 v3.0.3 h1:PMEUVKpfI9FOUw32o3wAHRaBS1XGxh6cFCy/VHk github.com/mustafaturan/bus/v3 v3.0.3/go.mod h1:JVCyq6Pb6S/IGI6LrzKH5vlBZ9ifsd1Js+wd4Y2+7Xg= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= -github.com/onsi/ginkgo/v2 v2.22.1 h1:QW7tbJAUDyVDVOM5dFa7qaybo+CRfR7bemlQUN6Z8aM= -github.com/onsi/ginkgo/v2 v2.22.1/go.mod h1:S6aTpoRsSq2cZOd+pssHAlKW/Q/jZt6cPrPlnj4a1xM= -github.com/onsi/gomega v1.36.2 h1:koNYke6TVk6ZmnyHrCXba/T/MoLBXFjeC1PtvYgw0A8= -github.com/onsi/gomega v1.36.2/go.mod h1:DdwyADRjrc825LhMEkD76cHR5+pUnjhUN8GlHlRPHzY= +github.com/onsi/ginkgo/v2 v2.25.3 h1:Ty8+Yi/ayDAGtk4XxmmfUy4GabvM+MegeB4cDLRi6nw= +github.com/onsi/ginkgo/v2 v2.25.3/go.mod h1:43uiyQC4Ed2tkOzLsEYm7hnrb7UJTWHYNsuy3bG/snE= +github.com/onsi/gomega v1.39.0 h1:y2ROC3hKFmQZJNFeGAMeHZKkjBL65mIZcvrLQBF9k6Q= +github.com/onsi/gomega v1.39.0/go.mod h1:ZCU1pkQcXDO5Sl9/VVEGlDyp+zm0m1cmeG5TOzLgdh4= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.0 h1:8SG7/vwALn54lVB/0yZ/MMwhFrPYtpEHQb2IpWsCzug= @@ -613,6 +613,8 @@ go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXe go.opentelemetry.io/proto/otlp v1.7.0 h1:jX1VolD6nHuFzOYso2E73H85i92Mv8JQYk0K9vz09os= go.opentelemetry.io/proto/otlp v1.7.0/go.mod h1:fSKjH6YJ7HDlwzltzyMj036AJ3ejJLCgCSHGj4efDDo= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/automaxprocs v1.6.0 h1:O3y2/QNTOdbF+e/dpXNNW7Rx2hZ4sTIPyybbxyNqTUs= +go.uber.org/automaxprocs v1.6.0/go.mod h1:ifeIMSnPZuznNm6jmdzmU3/bfk01Fe2fotchwEFJ8r8= go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= @@ -632,8 +634,8 @@ golang.org/x/crypto v0.0.0-20181203042331-505ab145d0a9/go.mod h1:6SG95UA2DQfeDnf golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= -golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= +golang.org/x/crypto v0.41.0 h1:WKYxWedPGCTVVl5+WHSSrOBT0O8lx32+zxmHxijgXp4= +golang.org/x/crypto v0.41.0/go.mod h1:pO5AFd7FA68rFak7rOAGVuygIISepHftHnr8dr6+sUc= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20250210185358-939b2ce775ac h1:l5+whBCLH3iH2ZNHYLbAe58bo7yrN4mVcnkHDYz5vvs= golang.org/x/exp v0.0.0-20250210185358-939b2ce775ac/go.mod h1:hH+7mtFmImwwcMvScyxUhjuVHR3HGaDPMn9rMSUUbxo= @@ -643,8 +645,8 @@ golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHl golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.25.0 h1:n7a+ZbQKQA/Ysbyb0/6IbB1H/X41mKgbhfv7AfG/44w= -golang.org/x/mod v0.25.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= +golang.org/x/mod v0.27.0 h1:kb+q2PyFnEADO2IEF935ehFUXlWiNjJWtRNgBLSfbxQ= +golang.org/x/mod v0.27.0/go.mod h1:rWI627Fq0DEoudcK+MBkNkCe0EetEaDSwJJkCcjpazc= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -654,8 +656,8 @@ golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= -golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= +golang.org/x/net v0.43.0 h1:lat02VYK2j4aLzMzecihNvTlJNQUq316m2Mr9rnM6YE= +golang.org/x/net v0.43.0/go.mod h1:vhO1fvI4dGsIjh73sWfUVjj3N7CA9WkKJNQm2svM6Jg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= @@ -679,12 +681,12 @@ golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20221010170243-090e33056c14/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k= golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= -golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= -golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= +golang.org/x/term v0.34.0 h1:O/2T7POpk0ZZ7MAzMeWFSg6S5IpWd/RXDlM9hgM3DR4= +golang.org/x/term v0.34.0/go.mod h1:5jC53AEywhIVebHgPVeg0mj8OD3VO9OzclacVrqpaAw= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= -golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -696,8 +698,8 @@ golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtn golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= -golang.org/x/tools v0.34.0 h1:qIpSLOxeCYGg9TrcJokLBG4KFA6d795g0xkBkiESGlo= -golang.org/x/tools v0.34.0/go.mod h1:pAP9OwEaY1CAW3HOmg3hLZC5Z0CCmzjAF2UQMSqNARg= +golang.org/x/tools v0.36.0 h1:kWS0uv/zsvHEle1LbV5LE8QujrxB3wfQyxHfhOk0Qkg= +golang.org/x/tools v0.36.0/go.mod h1:WBDiHKJK8YgLHlcQPYQzNCkUxUypCaa5ZegCVutKm+s= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -737,8 +739,8 @@ google.golang.org/protobuf v1.22.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2 google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= -google.golang.org/protobuf v1.36.6 h1:z1NpPI8ku2WgiWnf+t9wTPsn6eP1L7ksHUlkfLvd9xY= -google.golang.org/protobuf v1.36.6/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= +google.golang.org/protobuf v1.36.7 h1:IgrO7UwFQGJdRNXH/sQux4R1Dj1WAKcLElzeeRaXV2A= +google.golang.org/protobuf v1.36.7/go.mod h1:jduwjTPXsFjZGTmRluh+L6NjiWu7pchiJ2/5YcXBHnY= gopkg.in/cenkalti/backoff.v1 v1.1.0 h1:Arh75ttbsvlpVA7WtVpH4u9h6Zl46xuptxqLxPiSo4Y= gopkg.in/cenkalti/backoff.v1 v1.1.0/go.mod h1:J6Vskwqd+OMVJl8C33mmtxTBs2gyzfv7UDAkHu8BrjI= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/scheduler/pkg/agent/server.go b/scheduler/pkg/agent/server.go index 7f2bf41c9a..28c19961d0 100644 --- a/scheduler/pkg/agent/server.go +++ b/scheduler/pkg/agent/server.go @@ -19,7 +19,6 @@ import ( "sync" "time" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "google.golang.org/grpc" @@ -29,6 +28,7 @@ import ( pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" "github.com/seldonio/seldon-core/apis/go/v2/mlops/health" pbs "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" seldontls "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" diff --git a/scheduler/pkg/agent/server_test.go b/scheduler/pkg/agent/server_test.go index 5d787e75f9..e297b1a31e 100644 --- a/scheduler/pkg/agent/server_test.go +++ b/scheduler/pkg/agent/server_test.go @@ -17,9 +17,6 @@ import ( "time" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/internal/testing_utils" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/scheduler/mock" log "github.com/sirupsen/logrus" "go.uber.org/mock/gomock" "google.golang.org/grpc" @@ -28,9 +25,12 @@ import ( pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" pbs "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/internal/testing_utils" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/scheduler/mock" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) @@ -57,7 +57,7 @@ func TestSync(t *testing.T) { type ExpectedVersionState struct { version uint32 - expectedStates map[int]db.ReplicaStatus + expectedStates map[int]*db.ReplicaStatus } type test struct { name string @@ -96,7 +96,7 @@ func TestSync(t *testing.T) { expectedVersionStates: []ExpectedVersionState{ { version: 1, - expectedStates: map[int]db.ReplicaStatus{ + expectedStates: map[int]*db.ReplicaStatus{ 1: {State: db.ModelReplicaState_Loading}, }, }, @@ -130,7 +130,7 @@ func TestSync(t *testing.T) { expectedVersionStates: []ExpectedVersionState{ { version: 1, - expectedStates: map[int]db.ReplicaStatus{ + expectedStates: map[int]*db.ReplicaStatus{ 1: {State: db.ModelReplicaState_LoadFailed}, }, }, @@ -164,7 +164,7 @@ func TestSync(t *testing.T) { expectedVersionStates: []ExpectedVersionState{ { version: 1, - expectedStates: map[int]db.ReplicaStatus{ + expectedStates: map[int]*db.ReplicaStatus{ 1: {State: db.ModelReplicaState_LoadFailed}, }, }, @@ -198,7 +198,7 @@ func TestSync(t *testing.T) { expectedVersionStates: []ExpectedVersionState{ { version: 1, - expectedStates: map[int]db.ReplicaStatus{ + expectedStates: map[int]*db.ReplicaStatus{ 1: {State: db.ModelReplicaState_Unloading}, }, }, @@ -232,7 +232,7 @@ func TestSync(t *testing.T) { expectedVersionStates: []ExpectedVersionState{ { version: 1, - expectedStates: map[int]db.ReplicaStatus{ + expectedStates: map[int]*db.ReplicaStatus{ 1: {State: db.ModelReplicaState_UnloadFailed}, }, }, @@ -270,13 +270,13 @@ func TestSync(t *testing.T) { expectedVersionStates: []ExpectedVersionState{ { version: 1, - expectedStates: map[int]db.ReplicaStatus{ + expectedStates: map[int]*db.ReplicaStatus{ 1: {State: db.ModelReplicaState_Unloading}, }, }, { version: 2, - expectedStates: map[int]db.ReplicaStatus{ + expectedStates: map[int]*db.ReplicaStatus{ 1: {State: db.ModelReplicaState_Loading}, }, }, diff --git a/scheduler/pkg/envoy/processor/incremental.go b/scheduler/pkg/envoy/processor/incremental.go index 784540df65..26784d513d 100644 --- a/scheduler/pkg/envoy/processor/incremental.go +++ b/scheduler/pkg/envoy/processor/incremental.go @@ -16,9 +16,10 @@ import ( "sync" "time" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" "github.com/seldonio/seldon-core/scheduler/v2/pkg/envoy/xdscache" "github.com/seldonio/seldon-core/scheduler/v2/pkg/scheduler/cleaner" @@ -725,7 +726,7 @@ func (p *IncrementalProcessor) modelSync() { nil, ); err != nil { logger.WithError(err).Warnf("Failed to update replica state for model %s to %s from %s", - mv.name, store.UnloadRequested.String(), serverReplicaExpectedState.String()) + mv.name, db.ModelReplicaState_UnloadRequested.String(), serverReplicaExpectedState.String()) } } diff --git a/scheduler/pkg/envoy/processor/incremental_benchmark_test.go b/scheduler/pkg/envoy/processor/incremental_benchmark_test.go index ba4c7aad62..c9cbbc166b 100644 --- a/scheduler/pkg/envoy/processor/incremental_benchmark_test.go +++ b/scheduler/pkg/envoy/processor/incremental_benchmark_test.go @@ -15,12 +15,12 @@ import ( "testing" "time" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" "github.com/stretchr/testify/require" "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" "github.com/seldonio/seldon-core/scheduler/v2/pkg/envoy/xdscache" diff --git a/scheduler/pkg/envoy/processor/incremental_test.go b/scheduler/pkg/envoy/processor/incremental_test.go index 2decda813d..1a2f280ff8 100644 --- a/scheduler/pkg/envoy/processor/incremental_test.go +++ b/scheduler/pkg/envoy/processor/incremental_test.go @@ -24,12 +24,12 @@ import ( "github.com/envoyproxy/go-control-plane/pkg/cache/types" . "github.com/onsi/gomega" "github.com/onsi/gomega/format" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "google.golang.org/protobuf/encoding/protojson" pba "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" "github.com/seldonio/seldon-core/scheduler/v2/pkg/envoy/xdscache" diff --git a/scheduler/pkg/envoy/processor/server_test.go b/scheduler/pkg/envoy/processor/server_test.go index 01eb769bb5..71aba45d84 100644 --- a/scheduler/pkg/envoy/processor/server_test.go +++ b/scheduler/pkg/envoy/processor/server_test.go @@ -21,13 +21,14 @@ import ( client "github.com/envoyproxy/go-control-plane/pkg/client/sotw/v3" resource "github.com/envoyproxy/go-control-plane/pkg/resource/v3" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/anypb" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/envoy/xdscache" "github.com/seldonio/seldon-core/scheduler/v2/pkg/internal/testing_utils" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" diff --git a/scheduler/pkg/envoy/xdscache/seldoncache.go b/scheduler/pkg/envoy/xdscache/seldoncache.go index 88d2ab91d2..ed68af3b5f 100644 --- a/scheduler/pkg/envoy/xdscache/seldoncache.go +++ b/scheduler/pkg/envoy/xdscache/seldoncache.go @@ -21,9 +21,9 @@ import ( "github.com/envoyproxy/go-control-plane/pkg/cache/v3" "github.com/envoyproxy/go-control-plane/pkg/resource/v3" "github.com/envoyproxy/go-control-plane/pkg/server/stream/v3" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" seldontls "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" diff --git a/scheduler/pkg/envoy/xdscache/seldoncache_test.go b/scheduler/pkg/envoy/xdscache/seldoncache_test.go index fb54c85813..5229321e46 100644 --- a/scheduler/pkg/envoy/xdscache/seldoncache_test.go +++ b/scheduler/pkg/envoy/xdscache/seldoncache_test.go @@ -22,10 +22,10 @@ import ( matcherv3 "github.com/envoyproxy/go-control-plane/envoy/type/matcher/v3" . "github.com/onsi/gomega" "github.com/otiai10/copy" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" seldontls "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" diff --git a/scheduler/pkg/kafka/conflict-resolution/conflict_resolution.go b/scheduler/pkg/kafka/conflict-resolution/conflict_resolution.go index d00f66e2bd..80d33c6eb6 100644 --- a/scheduler/pkg/kafka/conflict-resolution/conflict_resolution.go +++ b/scheduler/pkg/kafka/conflict-resolution/conflict_resolution.go @@ -12,11 +12,11 @@ package conflict_resolution import ( "fmt" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/apis/go/v2/mlops/chainer" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/pipeline" ) diff --git a/scheduler/pkg/kafka/dataflow/server_test.go b/scheduler/pkg/kafka/dataflow/server_test.go index 828a18ec0b..aaa925ef05 100644 --- a/scheduler/pkg/kafka/dataflow/server_test.go +++ b/scheduler/pkg/kafka/dataflow/server_test.go @@ -19,7 +19,6 @@ import ( "time" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "go.uber.org/mock/gomock" "google.golang.org/grpc" @@ -27,6 +26,7 @@ import ( "github.com/seldonio/seldon-core/apis/go/v2/mlops/chainer" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" kafka_config "github.com/seldonio/seldon-core/components/kafka/v2/pkg/config" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" diff --git a/scheduler/pkg/scheduler/cleaner/versions.go b/scheduler/pkg/scheduler/cleaner/versions.go index 14ee3ad2eb..a9863ff217 100644 --- a/scheduler/pkg/scheduler/cleaner/versions.go +++ b/scheduler/pkg/scheduler/cleaner/versions.go @@ -12,9 +12,10 @@ package cleaner import ( "fmt" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) diff --git a/scheduler/pkg/scheduler/filters/deletedserver_test.go b/scheduler/pkg/scheduler/filters/deletedserver_test.go index e1cf4e3865..815707d0ce 100644 --- a/scheduler/pkg/scheduler/filters/deletedserver_test.go +++ b/scheduler/pkg/scheduler/filters/deletedserver_test.go @@ -13,10 +13,11 @@ import ( "testing" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + + "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) func TestDeletedServerFilter(t *testing.T) { diff --git a/scheduler/pkg/scheduler/filters/replicadraining_test.go b/scheduler/pkg/scheduler/filters/replicadraining_test.go index dd50e8b719..442a87f363 100644 --- a/scheduler/pkg/scheduler/filters/replicadraining_test.go +++ b/scheduler/pkg/scheduler/filters/replicadraining_test.go @@ -13,6 +13,7 @@ import ( "testing" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" ) diff --git a/scheduler/pkg/scheduler/filters/replicamemory_test.go b/scheduler/pkg/scheduler/filters/replicamemory_test.go index 86fc0a6c2a..0c5773f50a 100644 --- a/scheduler/pkg/scheduler/filters/replicamemory_test.go +++ b/scheduler/pkg/scheduler/filters/replicamemory_test.go @@ -13,11 +13,12 @@ import ( "testing" . "github.com/onsi/gomega" + + pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" - - pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" ) func getTestModelWithMemory(requiredmemory *uint64, serverName string, replicaId int) *db.ModelVersion { diff --git a/scheduler/pkg/scheduler/filters/serverreplicas_test.go b/scheduler/pkg/scheduler/filters/serverreplicas_test.go index b8b88430a7..6607452c02 100644 --- a/scheduler/pkg/scheduler/filters/serverreplicas_test.go +++ b/scheduler/pkg/scheduler/filters/serverreplicas_test.go @@ -13,10 +13,11 @@ import ( "testing" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + + "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) func TestServerReplicasFilter(t *testing.T) { diff --git a/scheduler/pkg/scheduler/filters/serverrequirements_test.go b/scheduler/pkg/scheduler/filters/serverrequirements_test.go index b47b5b820c..ee8d7a0ab3 100644 --- a/scheduler/pkg/scheduler/filters/serverrequirements_test.go +++ b/scheduler/pkg/scheduler/filters/serverrequirements_test.go @@ -13,11 +13,12 @@ import ( "testing" . "github.com/onsi/gomega" + + pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" - - pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" ) func TestServerRequirementFilter(t *testing.T) { diff --git a/scheduler/pkg/scheduler/filters/sharing_test.go b/scheduler/pkg/scheduler/filters/sharing_test.go index b052b7b58d..c2e5164976 100644 --- a/scheduler/pkg/scheduler/filters/sharing_test.go +++ b/scheduler/pkg/scheduler/filters/sharing_test.go @@ -13,10 +13,11 @@ import ( "testing" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + + "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) func TestSharingFilter(t *testing.T) { diff --git a/scheduler/pkg/scheduler/mock/interface.go b/scheduler/pkg/scheduler/mock/interface.go index 0149fc822d..200c6308bf 100644 --- a/scheduler/pkg/scheduler/mock/interface.go +++ b/scheduler/pkg/scheduler/mock/interface.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Code generated by MockGen. DO NOT EDIT. // Source: ./interface.go // diff --git a/scheduler/pkg/scheduler/scheduler.go b/scheduler/pkg/scheduler/scheduler.go index 98d2f0c025..3f9a5686c2 100644 --- a/scheduler/pkg/scheduler/scheduler.go +++ b/scheduler/pkg/scheduler/scheduler.go @@ -17,9 +17,10 @@ import ( "strings" "sync" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" "github.com/seldonio/seldon-core/scheduler/v2/pkg/scheduler/filters" "github.com/seldonio/seldon-core/scheduler/v2/pkg/scheduler/sorters" diff --git a/scheduler/pkg/scheduler/scheduler_test.go b/scheduler/pkg/scheduler/scheduler_test.go index 4db4941e48..c3756147e4 100644 --- a/scheduler/pkg/scheduler/scheduler_test.go +++ b/scheduler/pkg/scheduler/scheduler_test.go @@ -20,8 +20,6 @@ import ( "github.com/gotidy/ptr" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" log "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -29,12 +27,14 @@ import ( ptr2 "k8s.io/utils/ptr" pbs "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/mock" "github.com/seldonio/seldon-core/scheduler/v2/pkg/synchroniser" mock2 "github.com/seldonio/seldon-core/scheduler/v2/pkg/synchroniser/mock" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) func TestScheduler(t *testing.T) { @@ -1183,7 +1183,6 @@ func TestFailedModels(t *testing.T) { type test struct { name string - models []*db.Model setupMock func(m *mock.MockModelServerAPI) expectedFailedModels []string } diff --git a/scheduler/pkg/scheduler/sorters/loaded_test.go b/scheduler/pkg/scheduler/sorters/loaded_test.go index 829922eabc..ff1f1e05d0 100644 --- a/scheduler/pkg/scheduler/sorters/loaded_test.go +++ b/scheduler/pkg/scheduler/sorters/loaded_test.go @@ -14,7 +14,9 @@ import ( "testing" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) diff --git a/scheduler/pkg/scheduler/sorters/replicaindex_test.go b/scheduler/pkg/scheduler/sorters/replicaindex_test.go index 7b4fc12384..a6a88f5cc9 100644 --- a/scheduler/pkg/scheduler/sorters/replicaindex_test.go +++ b/scheduler/pkg/scheduler/sorters/replicaindex_test.go @@ -14,7 +14,9 @@ import ( "testing" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) diff --git a/scheduler/pkg/scheduler/sorters/replicamemory_test.go b/scheduler/pkg/scheduler/sorters/replicamemory_test.go index 1388f7c2e3..a312d7e347 100644 --- a/scheduler/pkg/scheduler/sorters/replicamemory_test.go +++ b/scheduler/pkg/scheduler/sorters/replicamemory_test.go @@ -14,7 +14,9 @@ import ( "testing" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "github.com/seldonio/seldon-core/scheduler/v2/pkg/util" ) diff --git a/scheduler/pkg/server/control_plane_test.go b/scheduler/pkg/server/control_plane_test.go index 1fd8372e5a..ff876901e4 100644 --- a/scheduler/pkg/server/control_plane_test.go +++ b/scheduler/pkg/server/control_plane_test.go @@ -16,12 +16,12 @@ import ( "time" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" diff --git a/scheduler/pkg/server/server.go b/scheduler/pkg/server/server.go index bd571812bf..d8d2d3fe15 100644 --- a/scheduler/pkg/server/server.go +++ b/scheduler/pkg/server/server.go @@ -19,16 +19,17 @@ import ( "sync" "time" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/health" - pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" - seldontls "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" log "github.com/sirupsen/logrus" "go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/health" + pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + seldontls "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" cr "github.com/seldonio/seldon-core/scheduler/v2/pkg/kafka/conflict-resolution" scaling_config "github.com/seldonio/seldon-core/scheduler/v2/pkg/scaling/config" diff --git a/scheduler/pkg/server/server_status.go b/scheduler/pkg/server/server_status.go index 29dbf34760..25ca383a57 100644 --- a/scheduler/pkg/server/server_status.go +++ b/scheduler/pkg/server/server_status.go @@ -14,10 +14,10 @@ import ( "fmt" "time" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" cr "github.com/seldonio/seldon-core/scheduler/v2/pkg/kafka/conflict-resolution" diff --git a/scheduler/pkg/server/server_status_test.go b/scheduler/pkg/server/server_status_test.go index a26964ee4d..97314f7396 100644 --- a/scheduler/pkg/server/server_status_test.go +++ b/scheduler/pkg/server/server_status_test.go @@ -16,19 +16,19 @@ import ( "time" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" log "github.com/sirupsen/logrus" "go.uber.org/mock/gomock" "google.golang.org/protobuf/proto" pba "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" scheduler2 "github.com/seldonio/seldon-core/scheduler/v2/pkg/scheduler" "github.com/seldonio/seldon-core/scheduler/v2/pkg/scheduler/cleaner" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/experiment" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/mock" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/pipeline" diff --git a/scheduler/pkg/server/server_test.go b/scheduler/pkg/server/server_test.go index 396cdf72ac..b408e70572 100644 --- a/scheduler/pkg/server/server_test.go +++ b/scheduler/pkg/server/server_test.go @@ -18,7 +18,6 @@ import ( "time" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -27,6 +26,7 @@ import ( pba "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/components/tls/v2/pkg/tls" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" diff --git a/scheduler/pkg/server/utils.go b/scheduler/pkg/server/utils.go index 3c358a7deb..0b1dfe8116 100644 --- a/scheduler/pkg/server/utils.go +++ b/scheduler/pkg/server/utils.go @@ -15,10 +15,11 @@ import ( "math/rand/v2" "time" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) diff --git a/scheduler/pkg/server/utils_test.go b/scheduler/pkg/server/utils_test.go index 0e81bb73ca..0e2b9650e6 100644 --- a/scheduler/pkg/server/utils_test.go +++ b/scheduler/pkg/server/utils_test.go @@ -15,6 +15,7 @@ import ( "time" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" diff --git a/scheduler/pkg/store/api.go b/scheduler/pkg/store/api.go index 24b51c9a92..c0ad39bf26 100644 --- a/scheduler/pkg/store/api.go +++ b/scheduler/pkg/store/api.go @@ -15,127 +15,11 @@ import ( "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" ) -type ServerSnapshot struct { - Name string - Replicas map[int]*ServerReplica - Shared bool - ExpectedReplicas int - MinReplicas int - MaxReplicas int - KubernetesMeta *pb.KubernetesMeta - Stats *ServerStats -} - type ServerStats struct { NumEmptyReplicas uint32 MaxNumReplicaHostedModels uint32 } -func (s *ServerSnapshot) String() string { - return s.Name -} - -type ModelSnapshot struct { - Name string - Versions []*ModelVersion - Deleted bool -} - -func (m *ModelSnapshot) GetLatest() *ModelVersion { - if len(m.Versions) > 0 { - return m.Versions[len(m.Versions)-1] - } else { - return nil - } -} - -func (m *ModelSnapshot) GetVersion(version uint32) *ModelVersion { - for _, mv := range m.Versions { - if mv.GetVersion() == version { - return mv - } - } - return nil -} - -func (m *ModelSnapshot) GetPrevious() *ModelVersion { - if len(m.Versions) > 1 { - return m.Versions[len(m.Versions)-2] - } else { - return nil - } -} - -func (m *ModelSnapshot) getLastAvailableModelIdx() int { - if m == nil { // TODO Make safe by not working on actual object - return -1 - } - lastAvailableIdx := -1 - for idx, mv := range m.Versions { - if mv.state.State == ModelAvailable { - lastAvailableIdx = idx - } - } - return lastAvailableIdx -} - -func (m *ModelSnapshot) getLastModelGwAvailableModelIdx() int { - if m == nil { // TODO Make safe by not working on actual object - return -1 - } - lastAvailableIdx := -1 - for idx, mv := range m.Versions { - if mv.state.ModelGwState == ModelAvailable { - lastAvailableIdx = idx - } - } - return lastAvailableIdx -} - -func (m *ModelSnapshot) CanReceiveTraffic() bool { - if m.GetLastAvailableModel() != nil { - return true - } - latestVersion := m.GetLatest() - if latestVersion != nil && latestVersion.HasLiveReplicas() { - return true - } - return false -} - -func (m *ModelSnapshot) GetLastAvailableModel() *ModelVersion { - if m == nil { // TODO Make safe by not working on actual object - return nil - } - lastAvailableIdx := m.getLastAvailableModelIdx() - if lastAvailableIdx != -1 { - return m.Versions[lastAvailableIdx] - } - return nil -} - -func (m *ModelSnapshot) GetVersionsBeforeLastAvailable() []*ModelVersion { - if m == nil { // TODO Make safe by not working on actual object - return nil - } - lastAvailableIdx := m.getLastAvailableModelIdx() - if lastAvailableIdx != -1 { - return m.Versions[0:lastAvailableIdx] - } - return nil -} - -func (m *ModelSnapshot) GetVersionsBeforeLastModelGwAvailable() []*ModelVersion { - if m == nil { // TODO Make safe by not working on actual object - return nil - } - lastAvailableIdx := m.getLastModelGwAvailableModelIdx() - if lastAvailableIdx != -1 { - return m.Versions[0:lastAvailableIdx] - } - return nil -} - //go:generate go tool mockgen -source=./api.go -destination=./mock/store.go -package=mock ModelServerAPI type ModelServerAPI interface { UpdateModel(config *pb.LoadModelRequest) error @@ -159,6 +43,6 @@ type ModelServerAPI interface { FailedScheduling(modelName string, version uint32, reason string, reset bool) error GetAllModels() ([]string, error) SetModelGwModelState(modelName string, versionNumber uint32, status db.ModelState, reason string, source string) error - // TODO better name... should it even be on tihs interface? + // TODO better name... should it even be on this interface? EmitEvents() error } diff --git a/scheduler/pkg/store/experiment/db.go b/scheduler/pkg/store/experiment/db.go index 89ba4751b6..4899dff814 100644 --- a/scheduler/pkg/store/experiment/db.go +++ b/scheduler/pkg/store/experiment/db.go @@ -13,10 +13,11 @@ import ( "time" "github.com/dgraph-io/badger/v3" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" "google.golang.org/protobuf/proto" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/utils" ) diff --git a/scheduler/pkg/store/experiment/state.go b/scheduler/pkg/store/experiment/state.go index 9e282a3d80..547f808bb6 100644 --- a/scheduler/pkg/store/experiment/state.go +++ b/scheduler/pkg/store/experiment/state.go @@ -13,6 +13,7 @@ import ( "errors" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" pipeline2 "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/pipeline" ) diff --git a/scheduler/pkg/store/experiment/state_test.go b/scheduler/pkg/store/experiment/state_test.go index 21e2a633a2..0c1421eea3 100644 --- a/scheduler/pkg/store/experiment/state_test.go +++ b/scheduler/pkg/store/experiment/state_test.go @@ -13,14 +13,14 @@ import ( "testing" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/mock" "github.com/sirupsen/logrus" "go.uber.org/mock/gomock" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/mock" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/pipeline" ) diff --git a/scheduler/pkg/store/experiment/store.go b/scheduler/pkg/store/experiment/store.go index 621886e328..f8f399d6e8 100644 --- a/scheduler/pkg/store/experiment/store.go +++ b/scheduler/pkg/store/experiment/store.go @@ -17,9 +17,10 @@ import ( "time" "github.com/mitchellh/copystructure" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/pipeline" diff --git a/scheduler/pkg/store/experiment/store_test.go b/scheduler/pkg/store/experiment/store_test.go index cdf35907fc..17ba331d4c 100644 --- a/scheduler/pkg/store/experiment/store_test.go +++ b/scheduler/pkg/store/experiment/store_test.go @@ -17,9 +17,10 @@ import ( "github.com/google/go-cmp/cmp" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) diff --git a/scheduler/pkg/store/experiment/utils_test.go b/scheduler/pkg/store/experiment/utils_test.go index 39e39407ac..2ab074750a 100644 --- a/scheduler/pkg/store/experiment/utils_test.go +++ b/scheduler/pkg/store/experiment/utils_test.go @@ -13,6 +13,7 @@ import ( "testing" . "github.com/onsi/gomega" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" ) diff --git a/scheduler/pkg/store/in_memory_store.go b/scheduler/pkg/store/in_memory_store.go index f1be6e5c6e..d02611204d 100644 --- a/scheduler/pkg/store/in_memory_store.go +++ b/scheduler/pkg/store/in_memory_store.go @@ -13,8 +13,9 @@ import ( "context" "sync" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "google.golang.org/protobuf/proto" + + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" ) type StorageInMemory[T interface { diff --git a/scheduler/pkg/store/in_memory_store_test.go b/scheduler/pkg/store/in_memory_store_test.go new file mode 100644 index 0000000000..ac53ce5ce7 --- /dev/null +++ b/scheduler/pkg/store/in_memory_store_test.go @@ -0,0 +1,546 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + +package store + +import ( + "context" + "sync" + "testing" + + . "github.com/onsi/gomega" + + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" +) + +func TestStorageInMemory_Get(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + setup func() *StorageInMemory[*db.Model] + id string + expectError error + expectName string + }{ + { + name: "get existing model", + setup: func() *StorageInMemory[*db.Model] { + storage := NewInMemoryStorage[*db.Model]() + _ = storage.Insert(context.Background(), &db.Model{Name: "model1"}) + return storage + }, + id: "model1", + expectError: nil, + expectName: "model1", + }, + { + name: "get non-existent model", + setup: func() *StorageInMemory[*db.Model] { + return NewInMemoryStorage[*db.Model]() + }, + id: "non-existent", + expectError: ErrNotFound, + expectName: "", + }, + { + name: "get from storage with multiple models", + setup: func() *StorageInMemory[*db.Model] { + storage := NewInMemoryStorage[*db.Model]() + _ = storage.Insert(context.Background(), &db.Model{Name: "model1"}) + _ = storage.Insert(context.Background(), &db.Model{Name: "model2"}) + _ = storage.Insert(context.Background(), &db.Model{Name: "model3"}) + return storage + }, + id: "model2", + expectError: nil, + expectName: "model2", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + storage := tt.setup() + result, err := storage.Get(context.Background(), tt.id) + + if tt.expectError != nil { + g.Expect(err).To(Equal(tt.expectError)) + } else { + g.Expect(err).To(BeNil()) + g.Expect(result.GetName()).To(Equal(tt.expectName)) + } + }) + } +} + +func TestStorageInMemory_Insert(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + setup func() *StorageInMemory[*db.Model] + record *db.Model + expectError error + }{ + { + name: "insert into empty storage", + setup: func() *StorageInMemory[*db.Model] { + return NewInMemoryStorage[*db.Model]() + }, + record: &db.Model{Name: "model1"}, + expectError: nil, + }, + { + name: "insert duplicate model", + setup: func() *StorageInMemory[*db.Model] { + storage := NewInMemoryStorage[*db.Model]() + _ = storage.Insert(context.Background(), &db.Model{Name: "model1"}) + return storage + }, + record: &db.Model{Name: "model1"}, + expectError: ErrAlreadyExists, + }, + { + name: "insert multiple different models", + setup: func() *StorageInMemory[*db.Model] { + storage := NewInMemoryStorage[*db.Model]() + _ = storage.Insert(context.Background(), &db.Model{Name: "model1"}) + return storage + }, + record: &db.Model{Name: "model2"}, + expectError: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + storage := tt.setup() + err := storage.Insert(context.Background(), tt.record) + + if tt.expectError != nil { + g.Expect(err).To(Equal(tt.expectError)) + } else { + g.Expect(err).To(BeNil()) + // Verify it was inserted + retrieved, getErr := storage.Get(context.Background(), tt.record.GetName()) + g.Expect(getErr).To(BeNil()) + g.Expect(retrieved.GetName()).To(Equal(tt.record.GetName())) + } + }) + } +} + +func TestStorageInMemory_List(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + setup func() *StorageInMemory[*db.Model] + expectedLen int + expectedNames []string + }{ + { + name: "list empty storage", + setup: func() *StorageInMemory[*db.Model] { + return NewInMemoryStorage[*db.Model]() + }, + expectedLen: 0, + expectedNames: []string{}, + }, + { + name: "list single model", + setup: func() *StorageInMemory[*db.Model] { + storage := NewInMemoryStorage[*db.Model]() + _ = storage.Insert(context.Background(), &db.Model{Name: "model1"}) + return storage + }, + expectedLen: 1, + expectedNames: []string{"model1"}, + }, + { + name: "list multiple models", + setup: func() *StorageInMemory[*db.Model] { + storage := NewInMemoryStorage[*db.Model]() + _ = storage.Insert(context.Background(), &db.Model{Name: "model1"}) + _ = storage.Insert(context.Background(), &db.Model{Name: "model2"}) + _ = storage.Insert(context.Background(), &db.Model{Name: "model3"}) + return storage + }, + expectedLen: 3, + expectedNames: []string{"model1", "model2", "model3"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + storage := tt.setup() + results, err := storage.List(context.Background()) + + g.Expect(err).To(BeNil()) + g.Expect(results).To(HaveLen(tt.expectedLen)) + + if tt.expectedLen > 0 { + names := make([]string, len(results)) + for i, r := range results { + names[i] = r.GetName() + } + g.Expect(names).To(ConsistOf(tt.expectedNames)) + } + }) + } +} + +func TestStorageInMemory_Update(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + setup func() *StorageInMemory[*db.Model] + record *db.Model + expectError error + }{ + { + name: "update existing model", + setup: func() *StorageInMemory[*db.Model] { + storage := NewInMemoryStorage[*db.Model]() + _ = storage.Insert(context.Background(), &db.Model{ + Name: "model1", + Versions: []*db.ModelVersion{{Version: 1}}, + }) + return storage + }, + record: &db.Model{ + Name: "model1", + Versions: []*db.ModelVersion{{Version: 2}}, + }, + expectError: nil, + }, + { + name: "update non-existent model", + setup: func() *StorageInMemory[*db.Model] { + return NewInMemoryStorage[*db.Model]() + }, + record: &db.Model{Name: "non-existent"}, + expectError: ErrNotFound, + }, + { + name: "update one of multiple models", + setup: func() *StorageInMemory[*db.Model] { + storage := NewInMemoryStorage[*db.Model]() + _ = storage.Insert(context.Background(), &db.Model{Name: "model1"}) + _ = storage.Insert(context.Background(), &db.Model{Name: "model2"}) + _ = storage.Insert(context.Background(), &db.Model{Name: "model3"}) + return storage + }, + record: &db.Model{Name: "model2", Deleted: true}, + expectError: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + storage := tt.setup() + err := storage.Update(context.Background(), tt.record) + + if tt.expectError != nil { + g.Expect(err).To(Equal(tt.expectError)) + } else { + g.Expect(err).To(BeNil()) + // Verify the update + retrieved, getErr := storage.Get(context.Background(), tt.record.GetName()) + g.Expect(getErr).To(BeNil()) + g.Expect(retrieved.GetName()).To(Equal(tt.record.GetName())) + if len(tt.record.Versions) > 0 { + g.Expect(retrieved.Versions).To(HaveLen(len(tt.record.Versions))) + g.Expect(retrieved.Versions[0].Version).To(Equal(tt.record.Versions[0].Version)) + } + } + }) + } +} + +func TestStorageInMemory_Delete(t *testing.T) { + g := NewWithT(t) + + tests := []struct { + name string + setup func() *StorageInMemory[*db.Model] + id string + expectError error + remainingCount int + }{ + { + name: "delete existing model", + setup: func() *StorageInMemory[*db.Model] { + storage := NewInMemoryStorage[*db.Model]() + _ = storage.Insert(context.Background(), &db.Model{Name: "model1"}) + return storage + }, + id: "model1", + expectError: nil, + remainingCount: 0, + }, + { + name: "delete non-existent model", + setup: func() *StorageInMemory[*db.Model] { + return NewInMemoryStorage[*db.Model]() + }, + id: "non-existent", + expectError: ErrNotFound, + remainingCount: 0, + }, + { + name: "delete one of multiple models", + setup: func() *StorageInMemory[*db.Model] { + storage := NewInMemoryStorage[*db.Model]() + _ = storage.Insert(context.Background(), &db.Model{Name: "model1"}) + _ = storage.Insert(context.Background(), &db.Model{Name: "model2"}) + _ = storage.Insert(context.Background(), &db.Model{Name: "model3"}) + return storage + }, + id: "model2", + expectError: nil, + remainingCount: 2, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + storage := tt.setup() + err := storage.Delete(context.Background(), tt.id) + + if tt.expectError != nil { + g.Expect(err).To(Equal(tt.expectError)) + } else { + g.Expect(err).To(BeNil()) + // Verify deletion + _, getErr := storage.Get(context.Background(), tt.id) + g.Expect(getErr).To(Equal(ErrNotFound)) + + // Check remaining count + list, _ := storage.List(context.Background()) + g.Expect(list).To(HaveLen(tt.remainingCount)) + } + }) + } +} + +func TestStorageInMemory_ServerType(t *testing.T) { + g := NewWithT(t) + + t.Run("operations with Server type", func(t *testing.T) { + storage := NewInMemoryStorage[*db.Server]() + + // Insert + server1 := &db.Server{Name: "server1", Shared: true} + err := storage.Insert(context.Background(), server1) + g.Expect(err).To(BeNil()) + + // Get + retrieved, err := storage.Get(context.Background(), "server1") + g.Expect(err).To(BeNil()) + g.Expect(retrieved.GetName()).To(Equal("server1")) + g.Expect(retrieved.Shared).To(Equal(true)) + + // Update + server1.Shared = false + err = storage.Update(context.Background(), server1) + g.Expect(err).To(BeNil()) + + retrieved, err = storage.Get(context.Background(), "server1") + g.Expect(err).To(BeNil()) + g.Expect(retrieved.Shared).To(Equal(false)) + + // Delete + err = storage.Delete(context.Background(), "server1") + g.Expect(err).To(BeNil()) + + _, err = storage.Get(context.Background(), "server1") + g.Expect(err).To(Equal(ErrNotFound)) + }) +} + +func TestStorageInMemory_Cloning(t *testing.T) { + g := NewWithT(t) + + t.Run("modifications to returned record don't affect storage", func(t *testing.T) { + storage := NewInMemoryStorage[*db.Model]() + original := &db.Model{ + Name: "model1", + Versions: []*db.ModelVersion{{Version: 1}}, + } + err := storage.Insert(context.Background(), original) + g.Expect(err).To(BeNil()) + + // Get the model + retrieved, err := storage.Get(context.Background(), "model1") + g.Expect(err).To(BeNil()) + + // Modify the retrieved model + retrieved.Deleted = true + retrieved.Versions = append(retrieved.Versions, &db.ModelVersion{Version: 2}) + + // Get again and verify storage wasn't affected + retrievedAgain, err := storage.Get(context.Background(), "model1") + g.Expect(err).To(BeNil()) + g.Expect(retrievedAgain.Deleted).To(BeFalse()) + g.Expect(retrievedAgain.Versions).To(HaveLen(1)) + g.Expect(retrievedAgain.Versions[0].Version).To(Equal(uint32(1))) + }) + + t.Run("modifications to inserted record don't affect storage", func(t *testing.T) { + storage := NewInMemoryStorage[*db.Model]() + model := &db.Model{ + Name: "model1", + Versions: []*db.ModelVersion{{Version: 1}}, + } + err := storage.Insert(context.Background(), model) + g.Expect(err).To(BeNil()) + + // Modify the original model + model.Deleted = true + model.Versions = append(model.Versions, &db.ModelVersion{Version: 2}) + + // Get from storage and verify it wasn't affected + retrieved, err := storage.Get(context.Background(), "model1") + g.Expect(err).To(BeNil()) + g.Expect(retrieved.Deleted).To(BeFalse()) + g.Expect(retrieved.Versions).To(HaveLen(1)) + g.Expect(retrieved.Versions[0].Version).To(Equal(uint32(1))) + }) +} + +func TestStorageInMemory_ConcurrentOperations(t *testing.T) { + g := NewWithT(t) + + t.Run("concurrent inserts", func(t *testing.T) { + storage := NewInMemoryStorage[*db.Model]() + var wg sync.WaitGroup + numGoroutines := 10 + + for i := 0; i < numGoroutines; i++ { + wg.Add(1) + go func(idx int) { + defer wg.Done() + model := &db.Model{Name: "model" + string(rune('0'+idx))} + _ = storage.Insert(context.Background(), model) + }(i) + } + + wg.Wait() + + list, err := storage.List(context.Background()) + g.Expect(err).To(BeNil()) + g.Expect(len(list)).To(BeNumerically("<=", numGoroutines)) + }) + + t.Run("concurrent reads and writes", func(t *testing.T) { + storage := NewInMemoryStorage[*db.Model]() + _ = storage.Insert(context.Background(), &db.Model{Name: "model1"}) + + var wg sync.WaitGroup + numReaders := 5 + numWriters := 5 + + // Concurrent readers + for i := 0; i < numReaders; i++ { + wg.Add(1) + go func() { + defer wg.Done() + _, _ = storage.Get(context.Background(), "model1") + }() + } + + // Concurrent writers + for i := 0; i < numWriters; i++ { + wg.Add(1) + go func() { + defer wg.Done() + _ = storage.Update(context.Background(), &db.Model{Name: "model1"}) + }() + } + + wg.Wait() + + // Verify storage is still consistent + retrieved, err := storage.Get(context.Background(), "model1") + g.Expect(err).To(BeNil()) + g.Expect(retrieved.GetName()).To(Equal("model1")) + }) + + t.Run("concurrent list operations", func(t *testing.T) { + storage := NewInMemoryStorage[*db.Model]() + for i := 0; i < 5; i++ { + _ = storage.Insert(context.Background(), &db.Model{Name: "model" + string(rune('0'+i))}) + } + + var wg sync.WaitGroup + numGoroutines := 10 + + for i := 0; i < numGoroutines; i++ { + wg.Add(1) + go func() { + defer wg.Done() + list, err := storage.List(context.Background()) + g.Expect(err).To(BeNil()) + g.Expect(len(list)).To(BeNumerically(">=", 0)) + }() + } + + wg.Wait() + }) +} + +func TestStorageInMemory_Integration(t *testing.T) { + g := NewWithT(t) + + t.Run("full lifecycle", func(t *testing.T) { + storage := NewInMemoryStorage[*db.Model]() + + // Insert multiple models + for i := 1; i <= 3; i++ { + model := &db.Model{ + Name: "model" + string(rune('0'+i)), + Versions: []*db.ModelVersion{{Version: uint32(i)}}, + } + err := storage.Insert(context.Background(), model) + g.Expect(err).To(BeNil()) + } + + // List all + list, err := storage.List(context.Background()) + g.Expect(err).To(BeNil()) + g.Expect(list).To(HaveLen(3)) + + // Update one + model2, err := storage.Get(context.Background(), "model2") + g.Expect(err).To(BeNil()) + model2.Deleted = true + err = storage.Update(context.Background(), model2) + g.Expect(err).To(BeNil()) + + // Verify update + updated, err := storage.Get(context.Background(), "model2") + g.Expect(err).To(BeNil()) + g.Expect(updated.Deleted).To(BeTrue()) + + // Delete one + err = storage.Delete(context.Background(), "model1") + g.Expect(err).To(BeNil()) + + // Verify deletion + list, err = storage.List(context.Background()) + g.Expect(err).To(BeNil()) + g.Expect(list).To(HaveLen(2)) + + // Try to get deleted model + _, err = storage.Get(context.Background(), "model1") + g.Expect(err).To(Equal(ErrNotFound)) + }) +} diff --git a/scheduler/pkg/store/memory.go b/scheduler/pkg/store/memory.go index cff9ebb414..f04ad20c4e 100644 --- a/scheduler/pkg/store/memory.go +++ b/scheduler/pkg/store/memory.go @@ -16,11 +16,11 @@ import ( "sort" "sync" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/utils" @@ -194,19 +194,6 @@ func (m *ModelServerStore) UpdateModel(req *pb.LoadModelRequest) error { return nil } -func (m *ModelServerStore) deepCopy(model *Model, key string) *ModelSnapshot { - snapshot := &ModelSnapshot{ - Name: key, - Deleted: model.IsDeleted(), - } - - snapshot.Versions = make([]*ModelVersion, len(model.versions)) - for i, version := range model.versions { - snapshot.Versions[i] = version.DeepCopy() - } - return snapshot -} - func (m *ModelServerStore) LockServer(serverId string) { var lock sync.RWMutex existingLock, _ := m.opLocks.LoadOrStore(serverLockID(serverId), &lock) diff --git a/scheduler/pkg/store/memory_status.go b/scheduler/pkg/store/memory_status.go index cdfc6b4d09..a88615bc00 100644 --- a/scheduler/pkg/store/memory_status.go +++ b/scheduler/pkg/store/memory_status.go @@ -14,9 +14,11 @@ import ( "fmt" "time" + "google.golang.org/protobuf/types/known/timestamppb" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" - "google.golang.org/protobuf/types/known/timestamppb" ) const ( diff --git a/scheduler/pkg/store/memory_test.go b/scheduler/pkg/store/memory_test.go index 644755313a..e238a4e2f9 100644 --- a/scheduler/pkg/store/memory_test.go +++ b/scheduler/pkg/store/memory_test.go @@ -18,11 +18,11 @@ import ( "time" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" log "github.com/sirupsen/logrus" "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/timestamppb" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" diff --git a/scheduler/pkg/store/mesh.go b/scheduler/pkg/store/mesh.go index fbf9dcb87f..f606e5aa21 100644 --- a/scheduler/pkg/store/mesh.go +++ b/scheduler/pkg/store/mesh.go @@ -10,150 +10,13 @@ the Change License after the Change Date as each is defined in accordance with t package store import ( - "fmt" "strings" - "sync" - "sync/atomic" - "time" - - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" - "google.golang.org/protobuf/proto" pba "github.com/seldonio/seldon-core/apis/go/v2/mlops/agent" pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" ) -type LocalSchedulerStore struct { - servers map[string]*Server - models map[string]*Model - failedToScheduleModels map[string]bool -} - -func NewLocalSchedulerStore() *LocalSchedulerStore { - m := LocalSchedulerStore{} - m.servers = make(map[string]*Server) - m.models = make(map[string]*Model) - m.failedToScheduleModels = make(map[string]bool) - return &m -} - -type Model struct { - name string - versions []*ModelVersion - deleted atomic.Bool -} - -type ModelVersionID struct { - Name string - Version uint32 -} - -func (mv *ModelVersionID) String() string { - return fmt.Sprintf("%s:%d", mv.Name, mv.Version) -} - -type ModelVersion struct { - modelDefn *pb.Model - version uint32 - serverMu sync.RWMutex - server string - replicas map[int]ReplicaStatus - state ModelStatus - mu sync.RWMutex -} - -type ModelStatus struct { - State ModelState - ModelGwState ModelState - Reason string - ModelGwReason string - AvailableReplicas uint32 - UnavailableReplicas uint32 - DrainingReplicas uint32 - Timestamp time.Time -} - -type ReplicaStatus struct { - State ModelReplicaState - Reason string - Timestamp time.Time -} - -func NewDefaultModelVersion(model *pb.Model, version uint32) *db.ModelVersion { - return &db.ModelVersion{ - Version: version, - ModelDefn: model, - Replicas: make(map[int32]*db.ReplicaStatus, 0), - State: &db.ModelStatus{ - State: db.ModelState_ModelStateUnknown, - ModelGwState: db.ModelState_ModelCreate, - }, - } -} - -// TODO: remove deleted from here and reflect in callers -// This is only used in tests, thus we don't need to worry about modelGWState -func NewModelVersion(model *pb.Model, version uint32, server string, replicas map[int]ReplicaStatus, deleted bool, state ModelState) *ModelVersion { - return &ModelVersion{ - version: version, - modelDefn: model, - server: server, - replicas: replicas, - state: ModelStatus{State: state}, - mu: sync.RWMutex{}, - } -} - -type Server struct { - name string - replicas map[int]*ServerReplica - shared bool - expectedReplicas int - minReplicas int - maxReplicas int - kubernetesMeta *pb.KubernetesMeta -} - -func (s *Server) CreateSnapshot(shallow bool, modelDetails bool) *ServerSnapshot { - // TODO: this is considered interface leakage if we do shallow copy by allowing - // callers to access and change this structure - // perhaps we consider returning back only what callers need - var replicas map[int]*ServerReplica - if !shallow { - replicas = make(map[int]*ServerReplica, len(s.replicas)) - for k, v := range s.replicas { - replicas[k] = v.createSnapshot(modelDetails) - } - } else { - replicas = s.replicas - } - return &ServerSnapshot{ - Name: s.name, - Replicas: replicas, - Shared: s.shared, - ExpectedReplicas: s.expectedReplicas, - MinReplicas: s.minReplicas, - MaxReplicas: s.maxReplicas, - KubernetesMeta: proto.Clone(s.kubernetesMeta).(*pb.KubernetesMeta), - } -} - -func (s *Server) SetExpectedReplicas(replicas int) { - s.expectedReplicas = replicas -} - -func (s *Server) SetMinReplicas(replicas int) { - s.minReplicas = replicas -} - -func (s *Server) SetMaxReplicas(replicas int) { - s.maxReplicas = replicas -} - -func (s *Server) SetKubernetesMeta(meta *pb.KubernetesMeta) { - s.kubernetesMeta = meta -} - func NewServer(name string, shared bool) *db.Server { return &db.Server{ Name: name, @@ -163,60 +26,6 @@ func NewServer(name string, shared bool) *db.Server { } } -type ServerReplica struct { - muReservedMemory sync.RWMutex - muLoadedModels sync.RWMutex - muDrainingState sync.RWMutex - inferenceSvc string - inferenceHttpPort int32 - inferenceGrpcPort int32 - serverName string - replicaIdx int - capabilities []string - memory uint64 - availableMemory uint64 - // precomputed values to speed up ops on scheduler - loadedModels map[ModelVersionID]bool - // for marking models that are in process of load requested or loading on this server (to speed up ops) - loadingModels map[ModelVersionID]bool - overCommitPercentage uint32 - // holding reserved memory on server replica while loading models, internal to scheduler - reservedMemory uint64 - // precomputed values to speed up ops on scheduler - uniqueLoadedModels map[string]bool - isDraining bool -} - -func NewServerReplica(inferenceSvc string, - inferenceHttpPort int32, - inferenceGrpcPort int32, - replicaIdx int, - server *db.Server, - capabilities []string, - memory, - availableMemory, - reservedMemory uint64, - loadedModels map[ModelVersionID]bool, - overCommitPercentage uint32, -) *ServerReplica { - return &ServerReplica{ - inferenceSvc: inferenceSvc, - inferenceHttpPort: inferenceHttpPort, - inferenceGrpcPort: inferenceGrpcPort, - serverName: server.Name, - replicaIdx: replicaIdx, - capabilities: cleanCapabilities(capabilities), - memory: memory, - availableMemory: availableMemory, - reservedMemory: reservedMemory, - loadedModels: loadedModels, - loadingModels: map[ModelVersionID]bool{}, - overCommitPercentage: overCommitPercentage, - //uniqueLoadedModels: toUniqueModels(loadedModels), - isDraining: false, - } -} - func NewServerReplicaFromConfig(server *db.Server, replicaIdx int, loadedModels []*db.ModelVersionID, config *pba.ReplicaConfig, availableMemoryBytes uint64) *db.ServerReplica { return &db.ServerReplica{ InferenceSvc: config.GetInferenceSvc(), @@ -235,6 +44,18 @@ func NewServerReplicaFromConfig(server *db.Server, replicaIdx int, loadedModels } } +func NewDefaultModelVersion(model *pb.Model, version uint32) *db.ModelVersion { + return &db.ModelVersion{ + Version: version, + ModelDefn: model, + Replicas: make(map[int32]*db.ReplicaStatus, 0), + State: &db.ModelStatus{ + State: db.ModelState_ModelStateUnknown, + ModelGwState: db.ModelState_ModelCreate, + }, + } +} + func cleanCapabilities(capabilities []string) []string { var cleaned []string for _, capability := range capabilities { @@ -243,579 +64,6 @@ func cleanCapabilities(capabilities []string) []string { return cleaned } -type ModelState uint32 - -//go:generate go tool stringer -type=ModelState -const ( - ModelStateUnknown ModelState = iota - ModelProgressing - ModelAvailable - ModelFailed - ModelTerminating - ModelTerminated - ModelTerminateFailed - ScheduleFailed - ModelScaledDown - ModelCreate - ModelTerminate -) - -type ModelReplicaState uint32 - -//go:generate go tool stringer -type=ModelReplicaState -const ( - ModelReplicaStateUnknown ModelReplicaState = iota - LoadRequested - Loading - Loaded - LoadFailed - UnloadEnvoyRequested - UnloadRequested - Unloading - Unloaded - UnloadFailed - Available - LoadedUnavailable - Draining -) - -var replicaStates = []ModelReplicaState{ - ModelReplicaStateUnknown, - LoadRequested, - Loading, - Loaded, - LoadFailed, - UnloadEnvoyRequested, - UnloadRequested, - Unloading, - Unloaded, - UnloadFailed, - Available, - LoadedUnavailable, - Draining, -} - -// LoadedUnavailable is included as we can try to move state to Available via an Envoy update -func (m ModelReplicaState) CanReceiveTraffic() bool { - return (m == Loaded || m == Available || m == LoadedUnavailable || m == Draining) -} - -func (m ModelReplicaState) AlreadyLoadingOrLoaded() bool { - return (m == Loading || m == Loaded || m == Available || m == LoadedUnavailable) -} - -func (m ModelReplicaState) UnloadingOrUnloaded() bool { - return (m == UnloadEnvoyRequested || m == UnloadRequested || m == Unloading || m == Unloaded || m == ModelReplicaStateUnknown) -} - -func (m ModelReplicaState) Inactive() bool { - return (m == Unloaded || m == UnloadFailed || m == ModelReplicaStateUnknown || m == LoadFailed) -} - -func (m ModelReplicaState) IsLoadingOrLoaded() bool { - return (m == Loaded || m == LoadRequested || m == Loading || m == Available || m == LoadedUnavailable) -} - -func (m *Model) Name() string { - return m.name -} - -func (m *Model) HasLatest() bool { - return len(m.versions) > 0 -} - -func (m *Model) Latest() *ModelVersion { - if len(m.versions) > 0 { - return m.versions[len(m.versions)-1] - } else { - return nil - } -} - -func (m *Model) GetVersion(version uint32) *ModelVersion { - for _, mv := range m.versions { - if mv.GetVersion() == version { - return mv - } - } - return nil -} - -func (m *Model) GetVersions() []uint32 { - versions := make([]uint32, len(m.versions)) - for idx, v := range m.versions { - versions[idx] = v.version - } - return versions -} - -func (m *Model) getLastAvailableModelVersionIdx() int { - lastAvailableIdx := -1 - for idx, mv := range m.versions { - if mv.state.State == ModelAvailable { - lastAvailableIdx = idx - } - } - return lastAvailableIdx -} - -func (m *Model) GetLastAvailableModelVersion() *ModelVersion { - lastAvailableIdx := m.getLastAvailableModelVersionIdx() - if lastAvailableIdx != -1 { - return m.versions[lastAvailableIdx] - } - return nil -} - -func (m *Model) Previous() *ModelVersion { - if len(m.versions) > 1 { - return m.versions[len(m.versions)-2] - } else { - return nil - } -} - -// TODO do we need to consider previous versions? -func (m *Model) Inactive() bool { - return m.Latest().Inactive() -} - -func (m *Model) IsDeleted() bool { - return m.deleted.Load() -} - -func (m *Model) SetDeleted() { - m.deleted.Store(true) -} - -func (m *ModelVersion) DeepCopy() *ModelVersion { - m.mu.RLock() - defer m.mu.RUnlock() - - state := m.state - - newMV := ModelVersion{ - version: m.version, - server: m.server, - state: state, - } - - if m.modelDefn != nil { - newMV.modelDefn = proto.Clone(m.modelDefn).(*pb.Model) - } - - if m.replicas != nil { - newMV.replicas = make(map[int]ReplicaStatus, len(m.replicas)) - for k, v := range m.replicas { - newMV.replicas[k] = v - } - } - - return &newMV -} - -func (m *ModelVersion) GetVersion() uint32 { - return m.version -} - -func (m *ModelVersion) GetRequiredMemory() uint64 { - var multiplier uint64 = 1 - if m.GetModelSpec() != nil && - m.GetModelSpec().ModelRuntimeInfo != nil && - m.GetModelSpec().ModelRuntimeInfo.ModelRuntimeInfo != nil { - multiplier = getInstanceCount(m.GetModelSpec().ModelRuntimeInfo) - } - return m.modelDefn.GetModelSpec().GetMemoryBytes() * multiplier -} - -func getInstanceCount(modelRuntimeInfo *pb.ModelRuntimeInfo) uint64 { - switch modelRuntimeInfo.ModelRuntimeInfo.(type) { - case *pb.ModelRuntimeInfo_Mlserver: - return uint64(modelRuntimeInfo.GetMlserver().ParallelWorkers) - case *pb.ModelRuntimeInfo_Triton: - return uint64(modelRuntimeInfo.GetTriton().Cpu[0].InstanceCount) - default: - return 1 - } -} - -func (m *ModelVersion) GetRequirements() []string { - return m.modelDefn.GetModelSpec().GetRequirements() -} - -func (m *ModelVersion) DesiredReplicas() int { - return int(m.modelDefn.GetDeploymentSpec().GetReplicas()) -} - -func (m *ModelVersion) GetModel() *pb.Model { - return proto.Clone(m.modelDefn).(*pb.Model) -} - -func (m *ModelVersion) GetMeta() *pb.MetaData { - return proto.Clone(m.modelDefn.GetMeta()).(*pb.MetaData) -} - -func (m *ModelVersion) GetModelSpec() *pb.ModelSpec { - return proto.Clone(m.modelDefn.GetModelSpec()).(*pb.ModelSpec) -} - -func (m *ModelVersion) GetDeploymentSpec() *pb.DeploymentSpec { - return proto.Clone(m.modelDefn.GetDeploymentSpec()).(*pb.DeploymentSpec) -} - -func (m *ModelVersion) SetDeploymentSpec(spec *pb.DeploymentSpec) { - m.modelDefn.DeploymentSpec = spec -} - -func (m *ModelVersion) SetServer(srv string) { - m.serverMu.Lock() - defer m.serverMu.Unlock() - m.server = srv -} - -func (m *ModelVersion) Server() string { - m.serverMu.RLock() - defer m.serverMu.RUnlock() - return m.server -} - -func (m *ModelVersion) ReplicaState() map[int]ReplicaStatus { - m.mu.RLock() - defer m.mu.RUnlock() - copy := make(map[int]ReplicaStatus, len(m.replicas)) - for idx, r := range m.replicas { - copy[idx] = r - } - return copy -} - -func (m *ModelVersion) ModelState() ModelStatus { - return m.state -} - -// note: this is used for testing purposes and should not be called directly in production -func (m *ModelVersion) SetModelState(s ModelStatus) { - m.state = s -} - -func (m *ModelVersion) GetModelReplicaState(replicaIdx int) ModelReplicaState { - m.mu.RLock() - defer m.mu.RUnlock() - state, ok := m.replicas[replicaIdx] - if !ok { - return ModelReplicaStateUnknown - } - return state.State -} - -func (m *ModelVersion) UpdateKubernetesMeta(meta *pb.KubernetesMeta) { - m.modelDefn.Meta.KubernetesMeta = meta -} - -func (m *ModelVersion) GetReplicaForState(state ModelReplicaState) []int { - m.mu.RLock() - defer m.mu.RUnlock() - var assignment []int - for k, v := range m.replicas { - if v.State == state { - assignment = append(assignment, k) - } - } - return assignment -} - -func (m *ModelVersion) GetRequestedServer() *string { - return m.modelDefn.GetModelSpec().Server -} - -func (m *ModelVersion) HasServer() bool { - return m.Server() != "" -} - -func (m *ModelVersion) Inactive() bool { - m.mu.RLock() - defer m.mu.RUnlock() - for _, v := range m.replicas { - if !v.State.Inactive() { - return false - } - } - return true -} - -func (m *ModelVersion) IsLoadingOrLoaded(server string, replicaIdx int) bool { - if server != m.Server() { - return false - } - m.mu.RLock() - defer m.mu.RUnlock() - for r, v := range m.replicas { - if r == replicaIdx && v.State.IsLoadingOrLoaded() { - return true - } - } - return false -} - -func (m *ModelVersion) IsLoadingOrLoadedOnServer() bool { - m.mu.RLock() - defer m.mu.RUnlock() - for _, v := range m.replicas { - if v.State.AlreadyLoadingOrLoaded() { - return true - } - } - return false -} - -func (m *ModelVersion) HasLiveReplicas() bool { - m.mu.RLock() - defer m.mu.RUnlock() - for _, v := range m.replicas { - if v.State.CanReceiveTraffic() { - return true - } - } - return false -} - -func (m *ModelVersion) GetAssignment() []int { - m.mu.RLock() - defer m.mu.RUnlock() - - var assignment []int - var draining []int - for k, v := range m.replicas { - if v.State == Loaded || v.State == Available || v.State == LoadedUnavailable { - assignment = append(assignment, k) - } - if v.State == Draining { - draining = append(draining, k) - } - } - // prefer assignments that are not draining as envoy is eventual consistent - if len(assignment) > 0 { - return assignment - } else if len(draining) > 0 { - return draining - } - return nil -} - -func (m *ModelVersion) Key() string { - return m.modelDefn.GetMeta().GetName() -} - -func (m *ModelVersion) SetReplicaState(replicaIdx int, state ModelReplicaState, reason string) { - m.mu.Lock() - defer m.mu.Unlock() - m.replicas[replicaIdx] = ReplicaStatus{State: state, Timestamp: time.Now(), Reason: reason} -} - -func (m *ModelVersion) DeleteReplica(replicaIdx int) { - m.mu.Lock() - defer m.mu.Unlock() - delete(m.replicas, replicaIdx) -} - -func (m *ModelVersion) UpdateRuntimeInfo(runtimeInfo *pb.ModelRuntimeInfo) { - m.mu.Lock() - defer m.mu.Unlock() - if m.modelDefn.ModelSpec != nil && m.modelDefn.ModelSpec.ModelRuntimeInfo == nil && runtimeInfo != nil { - m.modelDefn.ModelSpec.ModelRuntimeInfo = runtimeInfo - } -} - -func (s *Server) Key() string { - return s.name -} - -func (s *Server) NumReplicas() uint32 { - return uint32(len(s.replicas)) -} - -func (s *Server) GetAvailableMemory(idx int) uint64 { - if s != nil && idx < len(s.replicas) { - return s.replicas[idx].availableMemory - } - return 0 -} - -func (s *Server) GetMemory(idx int) uint64 { - if s != nil && idx < len(s.replicas) { - return s.replicas[idx].memory - } - return 0 -} - -func (s *Server) GetReplicaInferenceSvc(idx int) string { - return s.replicas[idx].inferenceSvc -} - -func (s *Server) GetReplicaInferenceHttpPort(idx int) int32 { - return s.replicas[idx].inferenceHttpPort -} - -func (s *ServerReplica) createSnapshot(modelDetails bool) *ServerReplica { - capabilities := make([]string, len(s.capabilities)) - copy(capabilities, s.capabilities) - - var loadedModels map[ModelVersionID]bool - var loadingModels map[ModelVersionID]bool - var uniqueLoadedModels map[string]bool - if modelDetails { - loadedModels = make(map[ModelVersionID]bool, len(s.loadedModels)) - for k, v := range s.loadedModels { - loadedModels[k] = v - } - loadingModels = make(map[ModelVersionID]bool, len(s.loadingModels)) - for k, v := range s.loadingModels { - loadingModels[k] = v - } - uniqueLoadedModels = make(map[string]bool, len(s.loadedModels)) - for k, v := range s.uniqueLoadedModels { - uniqueLoadedModels[k] = v - } - } - - return &ServerReplica{ - inferenceSvc: s.inferenceSvc, - inferenceHttpPort: s.inferenceHttpPort, - inferenceGrpcPort: s.inferenceGrpcPort, - serverName: s.serverName, - replicaIdx: s.replicaIdx, - capabilities: capabilities, - memory: s.memory, - availableMemory: s.availableMemory, - loadedModels: loadedModels, - loadingModels: loadingModels, - overCommitPercentage: s.overCommitPercentage, - reservedMemory: s.reservedMemory, - uniqueLoadedModels: uniqueLoadedModels, - isDraining: s.GetIsDraining(), - } -} - -func (s *ServerReplica) GetLoadedOrLoadingModelVersions() []ModelVersionID { - s.muLoadedModels.RLock() - defer s.muLoadedModels.RUnlock() - - var models []ModelVersionID - for model := range s.loadedModels { - models = append(models, model) - } - for model := range s.loadingModels { - models = append(models, model) - } - return models -} - -func (s *ServerReplica) GetNumLoadedModels() int { - s.muLoadedModels.RLock() - defer s.muLoadedModels.RUnlock() - - return len(s.uniqueLoadedModels) -} - -func (s *ServerReplica) GetAvailableMemory() uint64 { - return s.availableMemory -} - -func (s *ServerReplica) GetMemory() uint64 { - return s.memory -} - -func (s *ServerReplica) GetCapabilities() []string { - return s.capabilities -} - -func (s *ServerReplica) GetServerName() string { - return s.serverName -} - -func (s *ServerReplica) GetReplicaIdx() int { - return s.replicaIdx -} - -func (s *ServerReplica) GetInferenceSvc() string { - return s.inferenceSvc -} - -func (s *ServerReplica) GetInferenceHttpPort() int32 { - return s.inferenceHttpPort -} - -func (s *ServerReplica) GetInferenceGrpcPort() int32 { - return s.inferenceGrpcPort -} - -func (s *ServerReplica) GetOverCommitPercentage() uint32 { - return s.overCommitPercentage -} - -func (s *ServerReplica) GetReservedMemory() uint64 { - s.muReservedMemory.RLock() - defer s.muReservedMemory.RUnlock() - - return s.reservedMemory -} - -func (s *ServerReplica) GetIsDraining() bool { - s.muDrainingState.RLock() - defer s.muDrainingState.RUnlock() - - return s.isDraining -} - -func (s *ServerReplica) SetIsDraining() { - s.muDrainingState.Lock() - defer s.muDrainingState.Unlock() - - s.isDraining = true -} - -func (s *ServerReplica) UpdateReservedMemory(memBytes uint64, isAdd bool) { - s.muReservedMemory.Lock() - defer s.muReservedMemory.Unlock() - - if isAdd { - s.reservedMemory += memBytes - } else { - if memBytes > s.reservedMemory { - s.reservedMemory = 0 - } else { - s.reservedMemory -= memBytes - } - } -} - -func (s *ServerReplica) addModelVersion(modelName string, modelVersion uint32, replicaState ModelReplicaState) { - s.muLoadedModels.Lock() - defer s.muLoadedModels.Unlock() - - id := ModelVersionID{Name: modelName, Version: modelVersion} - if replicaState == Loading { - s.loadingModels[id] = true - } else if replicaState == Loaded { - delete(s.loadingModels, id) - - s.loadedModels[id] = true - s.uniqueLoadedModels[modelName] = true - } -} - -func (s *ServerReplica) deleteModelVersion(modelName string, modelVersion uint32) { - s.muLoadedModels.Lock() - defer s.muLoadedModels.Unlock() - - id := ModelVersionID{Name: modelName, Version: modelVersion} - delete(s.loadingModels, id) - delete(s.loadedModels, id) - if !modelExists(s.loadedModels, modelName) { - delete(s.uniqueLoadedModels, modelName) - } -} - func toUniqueModels(loadedModels []*db.ModelVersionID) map[string]bool { uniqueModels := make(map[string]bool) for _, key := range loadedModels { @@ -823,14 +71,3 @@ func toUniqueModels(loadedModels []*db.ModelVersionID) map[string]bool { } return uniqueModels } - -func modelExists(loadedModels map[ModelVersionID]bool, modelKey string) bool { - found := false - for key := range loadedModels { - if key.Name == modelKey { - found = true - break - } - } - return found -} diff --git a/scheduler/pkg/store/mesh_test.go b/scheduler/pkg/store/mesh_test.go index fff54134db..47f8480383 100644 --- a/scheduler/pkg/store/mesh_test.go +++ b/scheduler/pkg/store/mesh_test.go @@ -11,187 +11,10 @@ package store import ( "testing" - "time" . "github.com/onsi/gomega" - "google.golang.org/protobuf/proto" - "knative.dev/pkg/ptr" - - pb "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" ) -func TestModelVersion_DeepCopy(t *testing.T) { - uInt32Ptr := func(i uint32) *uint32 { - return &i - } - uInt64Ptr := func(i uint64) *uint64 { - return &i - } - - tests := []struct { - name string - setupMV func() *ModelVersion - validate func(t *testing.T, original, copied *ModelVersion) - }{ - { - name: "empty model version", - setupMV: func() *ModelVersion { - return &ModelVersion{} - }, - validate: func(t *testing.T, original, copied *ModelVersion) { - RegisterTestingT(t) - Expect(copied).ToNot(BeNil()) - Expect(copied.version).To(Equal(uint32(0))) - Expect(copied.server).To(Equal("")) - Expect(copied.state).To(Equal(ModelStatus{})) - Expect(copied.modelDefn).To(BeNil()) - Expect(copied.replicas).To(BeNil()) - }, - }, - { - name: "basic fields only", - setupMV: func() *ModelVersion { - return &ModelVersion{ - version: 123, - server: "test-server", - state: ModelStatus{ - State: ModelAvailable, - Reason: "some reason", - AvailableReplicas: 1, - UnavailableReplicas: 2, - DrainingReplicas: 3, - Timestamp: time.Now(), - }, - } - }, - validate: func(t *testing.T, original, copied *ModelVersion) { - RegisterTestingT(t) - Expect(copied.version).To(Equal(uint32(123))) - Expect(copied.server).To(Equal("test-server")) - Expect(copied.state).To(Equal(original.state)) - Expect(copied.modelDefn).To(BeNil()) - Expect(copied.replicas).To(BeNil()) - }, - }, - { - name: "with model definition", - setupMV: func() *ModelVersion { - return &ModelVersion{ - version: 456, - server: "model-server", - replicas: map[int]ReplicaStatus{ - 1: { - State: Available, - Reason: "some reason", - Timestamp: time.Now(), - }, - 2: { - State: LoadedUnavailable, - Reason: "some other reason", - Timestamp: time.Now(), - }, - }, - state: ModelStatus{ - State: ModelAvailable, - Reason: "some reason", - AvailableReplicas: 1, - UnavailableReplicas: 2, - DrainingReplicas: 3, - Timestamp: time.Now(), - }, - modelDefn: &pb.Model{ - Meta: &pb.MetaData{ - Name: "some name", - Kind: ptr.String("some kind"), - Version: ptr.String("some version"), - KubernetesMeta: &pb.KubernetesMeta{ - Namespace: "some namespace", - Generation: 1, - }, - }, - ModelSpec: &pb.ModelSpec{ - Uri: "some/url", - ArtifactVersion: uInt32Ptr(1), - Requirements: []string{"some requirements"}, - MemoryBytes: uInt64Ptr(2), - Server: ptr.String("some server"), - Parameters: []*pb.ParameterSpec{{ - Name: "some name", - Value: "some value", - }}, - ModelRuntimeInfo: &pb.ModelRuntimeInfo{ - ModelRuntimeInfo: &pb.ModelRuntimeInfo_Mlserver{ - Mlserver: &pb.MLServerModelSettings{ParallelWorkers: 2}, - }, - }, - ModelSpec: &pb.ModelSpec_Explainer{ - Explainer: &pb.ExplainerSpec{ - Type: "some type", - ModelRef: ptr.String("some model ref"), - PipelineRef: ptr.String("some pipeline ref"), - }, - }, - }, - }, - } - }, - validate: func(t *testing.T, original, copied *ModelVersion) { - RegisterTestingT(t) - Expect(copied.modelDefn).ToNot(BeNil()) - - // Verify it's a deep copy (different pointers) - Expect(copied.modelDefn).ToNot(BeIdenticalTo(original.modelDefn)) - - // Verify proto equality - Expect(proto.Equal(original.modelDefn, copied.modelDefn)).To(BeTrue()) - Expect(copied.version).To(Equal(original.version)) - Expect(copied.server).To(Equal(original.server)) - Expect(copied.state).To(Equal(original.state)) - Expect(copied.replicas).To(Equal(original.replicas)) - }, - }, - { - name: "with empty replicas map", - setupMV: func() *ModelVersion { - return &ModelVersion{ - version: 100, - server: "empty-replica-server", - state: ModelStatus{ - State: ModelAvailable, - Reason: "some reason", - AvailableReplicas: 1, - UnavailableReplicas: 2, - DrainingReplicas: 3, - Timestamp: time.Now(), - }, - replicas: make(map[int]ReplicaStatus), - } - }, - validate: func(t *testing.T, original, copied *ModelVersion) { - RegisterTestingT(t) - Expect(copied.replicas).ToNot(BeNil()) - Expect(len(copied.replicas)).To(Equal(0)) - Expect(copied.state).To(Equal(original.state)) - Expect(&copied.replicas).ToNot(BeIdenticalTo(&original.replicas)) - }, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - original := tt.setupMV() - copied := original.DeepCopy() - tt.validate(t, original, copied) - }) - } -} - -func TestReplicaStateToString(t *testing.T) { - for _, state := range replicaStates { - _ = state.String() - } -} - func TestCleanCapabilities(t *testing.T) { g := NewGomegaWithT(t) @@ -216,222 +39,3 @@ func TestCleanCapabilities(t *testing.T) { }) } } - -func TestCreateSnapshot(t *testing.T) { - g := NewGomegaWithT(t) - - server := &Server{ - name: "test", - replicas: map[int]*ServerReplica{ - 0: { - inferenceSvc: "svc", - loadedModels: map[ModelVersionID]bool{ - {Name: "model1", Version: 1}: true, - {Name: "model2", Version: 2}: true, - }, - loadingModels: map[ModelVersionID]bool{ - {Name: "model10", Version: 1}: true, - {Name: "model20", Version: 2}: true, - }, - }, - }, - kubernetesMeta: &pb.KubernetesMeta{Namespace: "default"}, - } - - snapshot := server.CreateSnapshot(false, true) - - g.Expect(snapshot.Replicas[0].loadedModels).To(Equal( - map[ModelVersionID]bool{ - {Name: "model1", Version: 1}: true, - {Name: "model2", Version: 2}: true, - }, - )) - - g.Expect(snapshot.Replicas[0].loadingModels).To(Equal( - map[ModelVersionID]bool{ - {Name: "model10", Version: 1}: true, - {Name: "model20", Version: 2}: true, - }, - )) - - server.replicas[1] = &ServerReplica{ - inferenceSvc: "svc", - loadedModels: map[ModelVersionID]bool{ - {Name: "model3", Version: 1}: true, - {Name: "model4", Version: 2}: true, - }, - } - server.name = "foo" - server.kubernetesMeta.Namespace = "test" - - g.Expect(snapshot.Name).To(Equal("test")) - g.Expect(len(snapshot.Replicas)).To(Equal(1)) - g.Expect(snapshot.KubernetesMeta.Namespace).To(Equal("default")) - -} - -func TestLoadedModel(t *testing.T) { - g := NewGomegaWithT(t) - - const ( - add int = iota - remove - ) - - type test struct { - name string - op int - model string - version int - state ModelReplicaState - loadedModels map[ModelVersionID]bool - uniqueLoadedModels map[string]bool - loadingModels map[ModelVersionID]bool - expectedLoadedModels map[ModelVersionID]bool - expectedLoadingModels map[ModelVersionID]bool - expectedUniqueLoadedModels map[string]bool - } - - tests := []test{ - { - name: "add loading", - op: add, - model: "dummy", - version: 1, - state: Loading, - loadedModels: map[ModelVersionID]bool{}, - loadingModels: map[ModelVersionID]bool{ - {Name: "dummy", Version: 1}: true, // we should already have an entry from an earlier load request - }, - uniqueLoadedModels: map[string]bool{}, - expectedLoadedModels: map[ModelVersionID]bool{}, - expectedUniqueLoadedModels: map[string]bool{}, - expectedLoadingModels: map[ModelVersionID]bool{ - {Name: "dummy", Version: 1}: true, - }, - }, - { - name: "add loaded", - op: add, - model: "dummy", - version: 1, - state: Loaded, - loadedModels: map[ModelVersionID]bool{}, - loadingModels: map[ModelVersionID]bool{ - {Name: "dummy", Version: 1}: true, // we should transition from loading to loaded - }, - uniqueLoadedModels: map[string]bool{}, - expectedLoadedModels: map[ModelVersionID]bool{ - {Name: "dummy", Version: 1}: true, - }, - expectedUniqueLoadedModels: map[string]bool{ - "dummy": true, - }, - expectedLoadingModels: map[ModelVersionID]bool{}, - }, - { - name: "add loading - new version", - op: add, - model: "dummy", - version: 2, - state: Loading, - loadedModels: map[ModelVersionID]bool{ - {Name: "dummy", Version: 1}: true, - }, - loadingModels: map[ModelVersionID]bool{}, - uniqueLoadedModels: map[string]bool{ - "dummy": true, - }, - expectedLoadedModels: map[ModelVersionID]bool{ - {Name: "dummy", Version: 1}: true, - }, - expectedUniqueLoadedModels: map[string]bool{ - "dummy": true, - }, - expectedLoadingModels: map[ModelVersionID]bool{ - {Name: "dummy", Version: 2}: true, - }, - }, - { - name: "add loaded- new version", - op: add, - model: "dummy", - version: 2, - state: Loaded, - loadedModels: map[ModelVersionID]bool{ - {Name: "dummy", Version: 1}: true, - }, - loadingModels: map[ModelVersionID]bool{ - {Name: "dummy", Version: 2}: true, // we should transition from loading to loaded - }, - uniqueLoadedModels: map[string]bool{ - "dummy": true, - }, - expectedLoadedModels: map[ModelVersionID]bool{ - {Name: "dummy", Version: 1}: true, - {Name: "dummy", Version: 2}: true, - }, - expectedUniqueLoadedModels: map[string]bool{ - "dummy": true, - }, - expectedLoadingModels: map[ModelVersionID]bool{}, - }, - { - name: "remove with early version", - op: remove, - model: "dummy", - version: 2, - loadedModels: map[ModelVersionID]bool{ - {Name: "dummy", Version: 1}: true, - {Name: "dummy", Version: 2}: true, - }, - loadingModels: map[ModelVersionID]bool{}, - uniqueLoadedModels: map[string]bool{ - "dummy": true, - }, - expectedLoadedModels: map[ModelVersionID]bool{ - {Name: "dummy", Version: 1}: true, - }, - expectedUniqueLoadedModels: map[string]bool{ - "dummy": true, - }, - expectedLoadingModels: map[ModelVersionID]bool{}, - }, - { - name: "remove", - op: remove, - model: "dummy", - version: 1, - loadedModels: map[ModelVersionID]bool{ - {Name: "dummy", Version: 1}: true, - }, - loadingModels: map[ModelVersionID]bool{}, - uniqueLoadedModels: map[string]bool{ - "dummy": true, - }, - expectedLoadedModels: map[ModelVersionID]bool{}, - expectedUniqueLoadedModels: map[string]bool{}, - expectedLoadingModels: map[ModelVersionID]bool{}, - }, - } - - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - server := ServerReplica{ - inferenceSvc: "svc", - loadedModels: test.loadedModels, - loadingModels: test.loadingModels, - uniqueLoadedModels: test.uniqueLoadedModels, - } - - if test.op == add { - server.addModelVersion(test.model, uint32(test.version), test.state) - } else { - server.deleteModelVersion(test.model, uint32(test.version)) - } - g.Expect(server.loadedModels).To(Equal(test.expectedLoadedModels)) - g.Expect(server.loadingModels).To(Equal(test.expectedLoadingModels)) - g.Expect(server.uniqueLoadedModels).To(Equal(test.expectedUniqueLoadedModels)) - }) - } -} diff --git a/scheduler/pkg/store/mock/store.go b/scheduler/pkg/store/mock/store.go index 1b849c9181..61a6581347 100644 --- a/scheduler/pkg/store/mock/store.go +++ b/scheduler/pkg/store/mock/store.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + // Code generated by MockGen. DO NOT EDIT. // Source: ./api.go // diff --git a/scheduler/pkg/store/modelreplicastate_string.go b/scheduler/pkg/store/modelreplicastate_string.go deleted file mode 100644 index 5d5f876b71..0000000000 --- a/scheduler/pkg/store/modelreplicastate_string.go +++ /dev/null @@ -1,44 +0,0 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - -// Code generated by "stringer -type=ModelReplicaState"; DO NOT EDIT. - -package store - -import "strconv" - -func _() { - // An "invalid array index" compiler error signifies that the constant values have changed. - // Re-run the stringer command to generate them again. - var x [1]struct{} - _ = x[ModelReplicaStateUnknown-0] - _ = x[LoadRequested-1] - _ = x[Loading-2] - _ = x[Loaded-3] - _ = x[LoadFailed-4] - _ = x[UnloadEnvoyRequested-5] - _ = x[UnloadRequested-6] - _ = x[Unloading-7] - _ = x[Unloaded-8] - _ = x[UnloadFailed-9] - _ = x[Available-10] - _ = x[LoadedUnavailable-11] - _ = x[Draining-12] -} - -const _ModelReplicaState_name = "ModelReplicaStateUnknownLoadRequestedLoadingLoadedLoadFailedUnloadEnvoyRequestedUnloadRequestedUnloadingUnloadedUnloadFailedAvailableLoadedUnavailableDraining" - -var _ModelReplicaState_index = [...]uint8{0, 24, 37, 44, 50, 60, 80, 95, 104, 112, 124, 133, 150, 158} - -func (i ModelReplicaState) String() string { - if i >= ModelReplicaState(len(_ModelReplicaState_index)-1) { - return "ModelReplicaState(" + strconv.FormatInt(int64(i), 10) + ")" - } - return _ModelReplicaState_name[_ModelReplicaState_index[i]:_ModelReplicaState_index[i+1]] -} diff --git a/scheduler/pkg/store/modelstate_string.go b/scheduler/pkg/store/modelstate_string.go deleted file mode 100644 index 7ed0bce200..0000000000 --- a/scheduler/pkg/store/modelstate_string.go +++ /dev/null @@ -1,42 +0,0 @@ -/* -Copyright (c) 2024 Seldon Technologies Ltd. - -Use of this software is governed BY -(1) the license included in the LICENSE file or -(2) if the license included in the LICENSE file is the Business Source License 1.1, -the Change License after the Change Date as each is defined in accordance with the LICENSE file. -*/ - -// Code generated by "stringer -type=ModelState"; DO NOT EDIT. - -package store - -import "strconv" - -func _() { - // An "invalid array index" compiler error signifies that the constant values have changed. - // Re-run the stringer command to generate them again. - var x [1]struct{} - _ = x[ModelStateUnknown-0] - _ = x[ModelProgressing-1] - _ = x[ModelAvailable-2] - _ = x[ModelFailed-3] - _ = x[ModelTerminating-4] - _ = x[ModelTerminated-5] - _ = x[ModelTerminateFailed-6] - _ = x[ScheduleFailed-7] - _ = x[ModelScaledDown-8] - _ = x[ModelCreate-9] - _ = x[ModelTerminate-10] -} - -const _ModelState_name = "ModelStateUnknownModelProgressingModelAvailableModelFailedModelTerminatingModelTerminatedModelTerminateFailedScheduleFailedModelScaledDownModelCreateModelTerminate" - -var _ModelState_index = [...]uint8{0, 17, 33, 47, 58, 74, 89, 109, 123, 138, 149, 163} - -func (i ModelState) String() string { - if i >= ModelState(len(_ModelState_index)-1) { - return "ModelState(" + strconv.FormatInt(int64(i), 10) + ")" - } - return _ModelState_name[_ModelState_index[i]:_ModelState_index[i+1]] -} diff --git a/scheduler/pkg/store/pipeline/db_test.go b/scheduler/pkg/store/pipeline/db_test.go index 2f5b8bdf56..266f7b1685 100644 --- a/scheduler/pkg/store/pipeline/db_test.go +++ b/scheduler/pkg/store/pipeline/db_test.go @@ -17,11 +17,13 @@ import ( "github.com/dgraph-io/badger/v3" "github.com/google/go-cmp/cmp" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/mock" log "github.com/sirupsen/logrus" "go.uber.org/mock/gomock" "google.golang.org/protobuf/proto" + + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/mock" ) func TestSaveWithTTL(t *testing.T) { diff --git a/scheduler/pkg/store/pipeline/status.go b/scheduler/pkg/store/pipeline/status.go index 8a151a47d0..2c52cb404b 100644 --- a/scheduler/pkg/store/pipeline/status.go +++ b/scheduler/pkg/store/pipeline/status.go @@ -14,9 +14,10 @@ import ( "fmt" "sync" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" ) diff --git a/scheduler/pkg/store/pipeline/store.go b/scheduler/pkg/store/pipeline/store.go index 8b88e76e83..dcb5e43137 100644 --- a/scheduler/pkg/store/pipeline/store.go +++ b/scheduler/pkg/store/pipeline/store.go @@ -17,10 +17,10 @@ import ( "time" "github.com/mitchellh/copystructure" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/sirupsen/logrus" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" "github.com/seldonio/seldon-core/scheduler/v2/pkg/store" diff --git a/scheduler/pkg/store/pipeline/store_test.go b/scheduler/pkg/store/pipeline/store_test.go index ae7cd5cf77..19c4a6a5f8 100644 --- a/scheduler/pkg/store/pipeline/store_test.go +++ b/scheduler/pkg/store/pipeline/store_test.go @@ -15,14 +15,14 @@ import ( "time" . "github.com/onsi/gomega" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" - "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/mock" "github.com/sirupsen/logrus" "go.uber.org/mock/gomock" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/store/mock" ) func TestGetPipelinesPipelineGwStatus(t *testing.T) { diff --git a/scheduler/pkg/store/pipeline/utils.go b/scheduler/pkg/store/pipeline/utils.go index d39dfba5c3..fa30cdae22 100644 --- a/scheduler/pkg/store/pipeline/utils.go +++ b/scheduler/pkg/store/pipeline/utils.go @@ -15,10 +15,10 @@ import ( "strings" "github.com/rs/xid" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" "google.golang.org/protobuf/types/known/timestamppb" "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" ) func CreateProtoFromPipelineVersion(pv *PipelineVersion) *scheduler.Pipeline { diff --git a/scheduler/pkg/store/test_memory_hack.go b/scheduler/pkg/store/test_memory_hack.go index f75dbaae3c..cdddaee8e8 100644 --- a/scheduler/pkg/store/test_memory_hack.go +++ b/scheduler/pkg/store/test_memory_hack.go @@ -15,9 +15,10 @@ import ( "fmt" "testing" - "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" log "github.com/sirupsen/logrus" + "github.com/seldonio/seldon-core/apis/go/v2/mlops/scheduler/db" + "github.com/seldonio/seldon-core/scheduler/v2/pkg/coordinator" ) diff --git a/scheduler/pkg/util/model_versions.go b/scheduler/pkg/util/model_versions.go index 029c1f8ba6..08ceec97ad 100644 --- a/scheduler/pkg/util/model_versions.go +++ b/scheduler/pkg/util/model_versions.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + package util import ( diff --git a/scheduler/pkg/util/server_replica.go b/scheduler/pkg/util/server_replica.go index db110a8a6a..7c5a94ba4a 100644 --- a/scheduler/pkg/util/server_replica.go +++ b/scheduler/pkg/util/server_replica.go @@ -1,3 +1,12 @@ +/* +Copyright (c) 2024 Seldon Technologies Ltd. + +Use of this software is governed BY +(1) the license included in the LICENSE file or +(2) if the license included in the LICENSE file is the Business Source License 1.1, +the Change License after the Change Date as each is defined in accordance with the LICENSE file. +*/ + package util import (