From e74f6b07ae2d09557d96841ca0fd9dfa74fae434 Mon Sep 17 00:00:00 2001 From: Tatsumi Gamou <47162587+gamoutatsumi@users.noreply.github.com> Date: Tue, 17 Dec 2024 16:41:44 +0900 Subject: [PATCH] support image selectable (pool-agent) (#48) --- pool-agent/README.md | 23 ++- pool-agent/cmd/agent.go | 291 +++++++++++++++++++++----------------- pool-agent/cmd/config.go | 23 +-- pool-agent/cmd/create.go | 20 +-- pool-agent/cmd/metrics.go | 4 +- pool-agent/go.mod | 1 + pool-agent/go.sum | 2 + 7 files changed, 204 insertions(+), 160 deletions(-) diff --git a/pool-agent/README.md b/pool-agent/README.md index 4fa6e8c..98dcc16 100644 --- a/pool-agent/README.md +++ b/pool-agent/README.md @@ -10,7 +10,6 @@ configuration file format is toml ```toml # sample configuration -image_alias = "https://example.com/container-image" [[resource_types_map]] name = "large" cpu = 2 @@ -31,12 +30,22 @@ memory = "120GB" name = "4xlarge" cpu = 16 memory = "160GB" -[resource_types_counts] -large = 10 -xlarge = 10 -2xlarge = 10 -3xlarge = 10 -4xlarge = 10 +[config.ubuntu2404] +image_alias = "https://example.com/container-image-noble" +[config.ubuntu2404.resource_types_counts] +large = 1 +xlarge = 1 +2xlarge = 1 +3xlarge = 1 +4xlarge = 1 +[config.ubuntu2204] +image_alias = "https://example.com/container-image-jammy" +[config.ubuntu2204.resource_types_counts] +large = 1 +xlarge = 1 +2xlarge = 1 +3xlarge = 1 +4xlarge = 1 ``` ### command line options diff --git a/pool-agent/cmd/agent.go b/pool-agent/cmd/agent.go index db82a5c..69d3504 100644 --- a/pool-agent/cmd/agent.go +++ b/pool-agent/cmd/agent.go @@ -15,18 +15,11 @@ import ( slm "github.com/whywaita/shoes-lxd-multi/server/pkg/api" ) -// Agent is an agent for pool mode. -type Agent struct { +type AgentConfig struct { ImageAlias string InstanceSource api.InstanceSource - ResourceTypesMap []ResourceTypesMap ResourceTypesCounts ResourceTypesCounts - Client lxd.InstanceServer - - CheckInterval time.Duration - WaitIdleTime time.Duration - ZombieAllowTime time.Duration creatingInstances map[string]instances deletingInstances instances @@ -34,7 +27,17 @@ type Agent struct { Hash string CreatedAt time.Time } - registry *prometheus.Registry +} + +// Agent is an agent for pool mode. +type Agent struct { + Config map[string]*AgentConfig + CheckInterval time.Duration + WaitIdleTime time.Duration + ZombieAllowTime time.Duration + registry *prometheus.Registry + ResourceTypesMap ResourceTypesMap + Client lxd.InstanceServer } var ( @@ -43,80 +46,99 @@ var ( type instances map[string]struct{} -func newAgent(ctx context.Context) (*Agent, error) { - conf, err := LoadConfig() +func genAgentConfig(config Config) *AgentConfig { + s, err := slm.ParseAlias(config.ImageAlias) if err != nil { - return nil, fmt.Errorf("load config: %w", err) + return nil } - source, err := slm.ParseAlias(conf.ImageAlias) - if err != nil { - return nil, err + s.Server = "" + creatingInstances := make(map[string]instances) + for k, v := range config.ResourceTypesCounts { + configuredInstancesCount.WithLabelValues(k, config.ImageAlias).Set(float64(v)) + creatingInstances[k] = make(instances) + } + return &AgentConfig{ + ImageAlias: config.ImageAlias, + InstanceSource: *s, + ResourceTypesCounts: config.ResourceTypesCounts, + currentImage: struct { + Hash string + CreatedAt time.Time + }{Hash: "", CreatedAt: time.Time{}}, + deletingInstances: make(instances), + creatingInstances: creatingInstances, } - source.Server = "" +} - c, err := lxd.ConnectLXDUnixWithContext(ctx, "", &lxd.ConnectionArgs{}) +func newAgent(ctx context.Context) (*Agent, error) { + confmap, err := LoadConfig() if err != nil { - return nil, fmt.Errorf("connect lxd: %w", err) + return nil, fmt.Errorf("load config: %w", err) + } + ac := make(map[string]*AgentConfig, len(confmap.Config)) + for version, conf := range confmap.Config { + agentConfig := genAgentConfig(conf) + if agentConfig == nil { + return nil, fmt.Errorf("failed to generate agent config") + } + ac[version] = agentConfig } checkInterval, waitIdleTime, zombieAllowTime, err := LoadParams() if err != nil { return nil, fmt.Errorf("load params: %w", err) } - creatingInstances := make(map[string]instances) - for _, rt := range conf.ResourceTypesMap { - creatingInstances[rt.Name] = make(instances) - } - registry := prometheus.NewRegistry() registry.Register(configuredInstancesCount) registry.Register(lxdInstances) - for k, v := range conf.ResourceTypesCounts { - configuredInstancesCount.WithLabelValues(k).Set(float64(v)) + c, err := lxd.ConnectLXDUnixWithContext(ctx, "", &lxd.ConnectionArgs{}) + if err != nil { + return nil, fmt.Errorf("connect lxd: %w", err) } - agent := &Agent{ - ImageAlias: conf.ImageAlias, - InstanceSource: *source, - - ResourceTypesMap: conf.ResourceTypesMap, - ResourceTypesCounts: conf.ResourceTypesCounts, - Client: c, - - CheckInterval: checkInterval, - WaitIdleTime: waitIdleTime, - ZombieAllowTime: zombieAllowTime, - currentImage: struct { - Hash string - CreatedAt time.Time - }{Hash: "", CreatedAt: time.Time{}}, - creatingInstances: creatingInstances, - deletingInstances: make(instances), - registry: registry, + agent := &Agent{ + Config: ac, + Client: c, + CheckInterval: checkInterval, + WaitIdleTime: waitIdleTime, + ZombieAllowTime: zombieAllowTime, + registry: registry, + ResourceTypesMap: confmap.ResourceTypesMap, } + return agent, nil } func (a *Agent) reloadConfig() error { - conf, err := LoadConfig() + confmap, err := LoadConfig() if err != nil { return fmt.Errorf("reload config: %w", err) } - for k, v := range conf.ResourceTypesCounts { - configuredInstancesCount.WithLabelValues(k).Set(float64(v)) - } - - if conf.ImageAlias != a.ImageAlias { - source, err := slm.ParseAlias(conf.ImageAlias) - if err != nil { - return fmt.Errorf("parse image alias: %w", err) + for version, conf := range confmap.Config { + for k, v := range conf.ResourceTypesCounts { + configuredInstancesCount.WithLabelValues(k, conf.ImageAlias).Set(float64(v)) + } + if _, ok := a.Config[version]; !ok { + agentConfig := genAgentConfig(conf) + if agentConfig == nil { + return fmt.Errorf("failed to generate agent config") + } + a.Config[version] = agentConfig + continue + } else { + s, err := slm.ParseAlias(conf.ImageAlias) + if err != nil { + return err + } + s.Server = "" + a.Config[version] = &AgentConfig{ + ImageAlias: conf.ImageAlias, + InstanceSource: *s, + ResourceTypesCounts: conf.ResourceTypesCounts, + } } - a.InstanceSource = *source - a.InstanceSource.Server = "" - a.ImageAlias = conf.ImageAlias } - a.ResourceTypesMap = conf.ResourceTypesMap - a.ResourceTypesCounts = conf.ResourceTypesCounts + a.ResourceTypesMap = confmap.ResourceTypesMap return nil } @@ -144,16 +166,16 @@ func (a *Agent) Run(ctx context.Context, sigHupCh chan os.Signal) error { } } -func (a *Agent) countPooledInstances(instances []api.Instance, resourceTypeName string) int { +func (a *Agent) countPooledInstances(instances []api.Instance, resourceTypeName, version string) int { count := 0 for _, i := range instances { if i.StatusCode != api.Frozen { continue } - if i.Config[configKeyImageAlias] != a.ImageAlias { + if i.Config[configKeyResourceType] != resourceTypeName { continue } - if i.Config[configKeyResourceType] != resourceTypeName { + if i.Config[configKeyImageAlias] != a.Config[version].ImageAlias { continue } if _, ok := i.Config[configKeyRunnerName]; ok { @@ -181,74 +203,76 @@ func (a *Agent) adjustInstancePool() error { return fmt.Errorf("get instances: %w", err) } - toDelete := []string{} - - for _, rt := range a.ResourceTypesMap { - current := a.countPooledInstances(s, rt.Name) - creating := len(a.creatingInstances[rt.Name]) - rtCount, ok := a.ResourceTypesCounts[rt.Name] - if !ok { - toDelete = append(toDelete, rt.Name) - continue - } else if rtCount == 0 { - toDelete = append(toDelete, rt.Name) - continue - } - createCount := rtCount - current - creating - if createCount < 1 { - continue - } - slog.Info("Create instances", "count", createCount, "flavor", rt.Name) - for i := 0; i < createCount; i++ { - name, err := generateInstanceName() - if err != nil { - return fmt.Errorf("generate instance name: %w", err) + for version, config := range a.Config { + toDelete := []string{} + for rtName, rt := range a.ResourceTypesMap { + current := a.countPooledInstances(s, rtName, version) + creating := len(config.creatingInstances[rtName]) + rtCount, ok := config.ResourceTypesCounts[rtName] + if !ok { + toDelete = append(toDelete, rtName) + continue + } else if rtCount == 0 { + toDelete = append(toDelete, rtName) + continue } - l := slog.With("instance", name, "flavor", rt.Name) - a.creatingInstances[rt.Name][name] = struct{}{} + createCount := rtCount - current - creating + if createCount < 1 { + continue + } + slog.Info("Create instances", "count", createCount, "flavor", rtName) + for i := 0; i < createCount; i++ { + iname, err := generateInstanceName() + if err != nil { + return fmt.Errorf("generate instance name: %w", err) + } + l := slog.With("instance", iname, "flavor", rtName, "version", version) + config.creatingInstances[rtName][iname] = struct{}{} - defer delete(a.creatingInstances[rt.Name], name) + defer delete(config.creatingInstances[rtName], iname) - if err := a.createInstance(name, rt, l); err != nil { - l.Error("failed to create instance", "err", err.Error()) + if err := a.createInstance(iname, rtName, rt, version, l); err != nil { + l.Error("failed to create instance", "err", err.Error()) + } } } - } - - for _, i := range s { - if i.Config[configKeyResourceType] == "" || i.Config[configKeyImageAlias] == "" { - continue - } - l := slog.With("instance", i.Name) - if _, ok := a.ResourceTypesCounts[i.Config[configKeyResourceType]]; !ok { - toDelete = append(toDelete, i.Config[configKeyResourceType]) - } - for _, rt := range toDelete { - if i.Config[configKeyResourceType] == rt { - l := l.With("flavor", rt) - l.Info("Deleting disabled flavor instance") - if err := a.deleteInstance(i); err != nil { - l.Error("failed to delete instance", "err", err.Error()) - continue + for _, i := range s { + if i.Config[configKeyResourceType] == "" || i.Config[configKeyImageAlias] != config.ImageAlias { + continue + } + l := slog.With("instance", i.Name, "version", version) + if _, ok := config.ResourceTypesCounts[i.Config[configKeyResourceType]]; !ok { + if i.Config[configKeyImageAlias] == config.ImageAlias { + toDelete = append(toDelete, i.Config[configKeyResourceType]) } - l.Info("Deleted disabled flavor instance") } - } - if a.isZombieInstance(i) { - l.Info("Deleting zombie instance") - if err := a.deleteInstance(i); err != nil { - l.Error("failed to delete zombie instance", "err", err.Error()) + for _, rt := range toDelete { + if i.Config[configKeyResourceType] == rt { + l := l.With("flavor", rt) + l.Info("Deleting disabled flavor instance") + if err := a.deleteInstance(i, version); err != nil { + l.Error("failed to delete instance", "err", err.Error()) + continue + } + l.Info("Deleted disabled flavor instance") + } } - l.Info("Deleted zombie instance") - } - if isOld, err := a.isOldImageInstance(i); err != nil { - l.Error("failed to check old image instance", "err", err.Error()) - } else if isOld { - l.Info("Deleting old image instance") - if err := a.deleteInstance(i); err != nil { - l.Error("failed to delete old image instance", "err", err.Error()) + if a.isZombieInstance(i, version) { + l.Info("Deleting zombie instance") + if err := a.deleteInstance(i, version); err != nil { + l.Error("failed to delete zombie instance", "err", err.Error()) + } + l.Info("Deleted zombie instance") + } + if isOld, err := a.isOldImageInstance(i, version); err != nil { + l.Error("failed to check old image instance", "err", err.Error()) + } else if isOld { + l.Info("Deleting old image instance") + if err := a.deleteInstance(i, version); err != nil { + l.Error("failed to delete old image instance", "err", err.Error()) + } + l.Info("Deleted old image instance") } - l.Info("Deleted old image instance") } } @@ -283,19 +307,19 @@ func (a *Agent) collectMetrics() error { } lxdInstances.Reset() for _, i := range s { - lxdInstances.WithLabelValues(i.Status, i.Config[configKeyResourceType]).Inc() + lxdInstances.WithLabelValues(i.Status, i.Config[configKeyResourceType], i.Config[configKeyImageAlias]).Inc() } return nil } -func (a *Agent) isZombieInstance(i api.Instance) bool { +func (a *Agent) isZombieInstance(i api.Instance, version string) bool { if i.StatusCode == api.Frozen { return false } if _, ok := i.Config[configKeyRunnerName]; ok { return false } - if i.Config[configKeyImageAlias] != a.ImageAlias { + if i.Config[configKeyImageAlias] != a.Config[version].ImageAlias { return false } if i.CreatedAt.Add(a.ZombieAllowTime).After(time.Now()) { @@ -303,37 +327,40 @@ func (a *Agent) isZombieInstance(i api.Instance) bool { } if rt, ok := i.Config[configKeyResourceType]; !ok { return false - } else if _, ok := a.creatingInstances[rt][i.Name]; ok { + } else if _, ok := a.Config[version].creatingInstances[rt][i.Name]; ok { return false } return true } -func (a *Agent) isOldImageInstance(i api.Instance) (bool, error) { +func (a *Agent) isOldImageInstance(i api.Instance, version string) (bool, error) { baseImage, ok := i.Config["volatile.base_image"] if !ok { return false, errors.New("Failed to get volatile.base_image") } - if baseImage != a.currentImage.Hash { - if i.CreatedAt.Before(a.currentImage.CreatedAt) { + if i.Config[configKeyImageAlias] != a.Config[version].ImageAlias { + return false, nil + } + if baseImage != a.Config[version].currentImage.Hash { + if i.CreatedAt.Before(a.Config[version].currentImage.CreatedAt) { if i.StatusCode == api.Frozen { return true, nil } return false, nil } - a.currentImage.Hash = baseImage - a.currentImage.CreatedAt = i.CreatedAt + a.Config[version].currentImage.Hash = baseImage + a.Config[version].currentImage.CreatedAt = i.CreatedAt return false, nil } return false, nil } -func (a *Agent) deleteInstance(i api.Instance) error { - if _, ok := a.deletingInstances[i.Name]; ok { +func (a *Agent) deleteInstance(i api.Instance, version string) error { + if _, ok := a.Config[version].deletingInstances[i.Name]; ok { return nil } - a.deletingInstances[i.Name] = struct{}{} - defer delete(a.deletingInstances, i.Name) + a.Config[version].deletingInstances[i.Name] = struct{}{} + defer delete(a.Config[version].deletingInstances, i.Name) _, etag, err := a.Client.GetInstance(i.Name) if err != nil { return fmt.Errorf("get instance: %w", err) diff --git a/pool-agent/cmd/config.go b/pool-agent/cmd/config.go index 9f3654e..1047134 100644 --- a/pool-agent/cmd/config.go +++ b/pool-agent/cmd/config.go @@ -6,37 +6,42 @@ import ( "time" "github.com/lxc/lxd/shared/api" - slm "github.com/whywaita/shoes-lxd-multi/server/pkg/api" "github.com/pelletier/go-toml/v2" + slm "github.com/whywaita/shoes-lxd-multi/server/pkg/api" ) // Config is config map for pool agent. type Config struct { ImageAlias string `toml:"image_alias"` - ResourceTypesMap []ResourceTypesMap `toml:"resource_types_map"` ResourceTypesCounts ResourceTypesCounts `toml:"resource_types_counts"` } -// ResourceTypesMap is resource configuration for pool mode. -type ResourceTypesMap struct { - Name string `toml:"name"` +// ConfigPerVersion is config map for pool agent per version. +type ConfigMap struct { + ResourceTypesMap ResourceTypesMap `toml:"resource_types_map"` + Config map[string]Config `toml:"config"` +} +type resourceType struct { CPUCore int `toml:"cpu"` Memory string `toml:"memory"` } +// ResourceTypesMap is resource configuration for pool mode. +type ResourceTypesMap map[string]resourceType + // ResourceTypesCounts is counts for resouce types. type ResourceTypesCounts map[string]int // LoadConfig LoadConfig loads config from configPath -func LoadConfig() (Config, error) { +func LoadConfig() (ConfigMap, error) { f, err := os.ReadFile(configPath) if err != nil { - return Config{}, fmt.Errorf("failed read config file: %w", err) + return ConfigMap{}, fmt.Errorf("failed read config file: %w", err) } - var s Config + var s ConfigMap if err := toml.Unmarshal(f, &s); err != nil { - return Config{}, fmt.Errorf("parse config file: %w", err) + return ConfigMap{}, fmt.Errorf("parse config file: %w", err) } return s, nil } diff --git a/pool-agent/cmd/create.go b/pool-agent/cmd/create.go index f1c6c96..83b1f96 100644 --- a/pool-agent/cmd/create.go +++ b/pool-agent/cmd/create.go @@ -10,10 +10,10 @@ import ( "github.com/lxc/lxd/shared/api" ) -func (a *Agent) createInstance(name string, rt ResourceTypesMap, l *slog.Logger) error { +func (a *Agent) createInstance(iname, rtName string, rt resourceType, version string, l *slog.Logger) error { l.Info("Creating instance") op, err := a.Client.CreateInstance(api.InstancesPost{ - Name: name, + Name: iname, InstancePut: api.InstancePut{ Config: map[string]string{ "limits.cpu": strconv.Itoa(rt.CPUCore), @@ -25,8 +25,8 @@ func (a *Agent) createInstance(name string, rt ResourceTypesMap, l *slog.Logger) "lxc.cgroup.devices.allow = a", "lxc.cap.drop=", }, "\n"), - configKeyImageAlias: a.ImageAlias, - configKeyResourceType: rt.Name, + configKeyImageAlias: a.Config[version].ImageAlias, + configKeyResourceType: rtName, }, Devices: map[string]map[string]string{ "kmsg": { @@ -36,7 +36,7 @@ func (a *Agent) createInstance(name string, rt ResourceTypesMap, l *slog.Logger) }, }, }, - Source: a.InstanceSource, + Source: a.Config[version].InstanceSource, }) if err != nil { return fmt.Errorf("create: %w", err) @@ -46,7 +46,7 @@ func (a *Agent) createInstance(name string, rt ResourceTypesMap, l *slog.Logger) } l.Info("Starting instance") - op, err = a.Client.UpdateInstanceState(name, api.InstanceStatePut{ + op, err = a.Client.UpdateInstanceState(iname, api.InstanceStatePut{ Action: "start", Timeout: -1, }, "") @@ -58,7 +58,7 @@ func (a *Agent) createInstance(name string, rt ResourceTypesMap, l *slog.Logger) } l.Info("Waiting system bus in instance") - op, err = a.Client.ExecInstance(name, api.InstanceExecPost{ + op, err = a.Client.ExecInstance(iname, api.InstanceExecPost{ Command: []string{"bash", "-c", "until test -e /var/run/dbus/system_bus_socket; do sleep 0.5; done"}, }, nil) if err != nil { @@ -69,7 +69,7 @@ func (a *Agent) createInstance(name string, rt ResourceTypesMap, l *slog.Logger) } l.Info("Waiting system running for instance") - op, err = a.Client.ExecInstance(name, api.InstanceExecPost{ + op, err = a.Client.ExecInstance(iname, api.InstanceExecPost{ Command: []string{"systemctl", "is-system-running", "--wait"}, }, nil) if err != nil { @@ -80,7 +80,7 @@ func (a *Agent) createInstance(name string, rt ResourceTypesMap, l *slog.Logger) } l.Info("Disabling systemd service watchdogs in instance") - op, err = a.Client.ExecInstance(name, api.InstanceExecPost{ + op, err = a.Client.ExecInstance(iname, api.InstanceExecPost{ Command: []string{"systemctl", "service-watchdogs", "no"}, }, nil) if err != nil { @@ -94,7 +94,7 @@ func (a *Agent) createInstance(name string, rt ResourceTypesMap, l *slog.Logger) time.Sleep(a.WaitIdleTime) l.Info("Freezing instance") - op, err = a.Client.UpdateInstanceState(name, api.InstanceStatePut{ + op, err = a.Client.UpdateInstanceState(iname, api.InstanceStatePut{ Action: "freeze", Timeout: -1, }, "") diff --git a/pool-agent/cmd/metrics.go b/pool-agent/cmd/metrics.go index 40984cf..4585df3 100644 --- a/pool-agent/cmd/metrics.go +++ b/pool-agent/cmd/metrics.go @@ -12,7 +12,7 @@ var ( Subsystem: "configured", Namespace: "pool_agent", }, - []string{"flavor"}, + []string{"flavor", "image_alias"}, ) lxdInstances = prometheus.NewGaugeVec( prometheus.GaugeOpts{ @@ -21,6 +21,6 @@ var ( Subsystem: "lxd", Namespace: "pool_agent", }, - []string{"status", "flavor"}, + []string{"status", "flavor", "image_alias"}, ) ) diff --git a/pool-agent/go.mod b/pool-agent/go.mod index d5b30e3..6a5acb6 100644 --- a/pool-agent/go.mod +++ b/pool-agent/go.mod @@ -5,6 +5,7 @@ go 1.22.0 toolchain go1.22.2 require ( + dario.cat/mergo v1.0.1 github.com/lxc/lxd v0.0.0-20220308034307-91f3610e71c1 github.com/pelletier/go-toml/v2 v2.2.1 github.com/pkg/errors v0.9.1 diff --git a/pool-agent/go.sum b/pool-agent/go.sum index 6e28afa..4ab3010 100644 --- a/pool-agent/go.sum +++ b/pool-agent/go.sum @@ -45,6 +45,8 @@ cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0Zeo cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs= cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= +dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s= +dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=