diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 9392ad43c..6ec0b09df 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -13,6 +13,8 @@ jobs: cover: runs-on: ubuntu-latest steps: + - name: Check architecture + run: uname -m - name: Set up Go uses: actions/setup-go@v2 with: diff --git a/.golangci.yaml b/.golangci.yaml index 102857c18..faa3ffb90 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -4,9 +4,7 @@ run: linters: disable-all: true enable: - - deadcode - unused - - varcheck - ineffassign - goimports - gofmt @@ -15,7 +13,6 @@ linters: - unconvert - govet - errcheck - - structcheck - staticcheck linters-settings: diff --git a/cmd/craned/app/manager.go b/cmd/craned/app/manager.go index 57dfac9ef..b939e9c28 100644 --- a/cmd/craned/app/manager.go +++ b/cmd/craned/app/manager.go @@ -17,6 +17,7 @@ import ( clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" "k8s.io/client-go/scale" + "k8s.io/component-base/logs" "k8s.io/klog/v2" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/cache" @@ -52,6 +53,7 @@ import ( "github.com/gocrane/crane/pkg/recommendation" "github.com/gocrane/crane/pkg/server" serverconfig "github.com/gocrane/crane/pkg/server/config" + "github.com/gocrane/crane/pkg/utils" "github.com/gocrane/crane/pkg/utils/target" "github.com/gocrane/crane/pkg/webhooks" ) @@ -83,6 +85,7 @@ func NewManagerCommand(ctx context.Context) *cobra.Command { cmd.Flags().AddGoFlagSet(flag.CommandLine) opts.AddFlags(cmd.Flags()) + logs.AddFlags(cmd.Flags()) utilfeature.DefaultMutableFeatureGate.AddFlag(cmd.Flags()) return cmd @@ -143,7 +146,7 @@ func Run(ctx context.Context, opts *options.Options) error { } }() - initControllers(podOOMRecorder, mgr, opts, predictorMgr, historyDataSources[providers.PrometheusDataSource]) + initControllers(ctx, podOOMRecorder, mgr, opts, predictorMgr, historyDataSources[providers.PrometheusDataSource]) // initialize custom collector metrics initMetricCollector(mgr) runAll(ctx, mgr, predictorMgr, dataSourceProviders[providers.PrometheusDataSource], opts) @@ -256,6 +259,8 @@ func initDataSources(mgr ctrl.Manager, opts *options.Options) (map[providers.Dat hybridDataSources[providers.PrometheusDataSource] = provider realtimeDataSources[providers.PrometheusDataSource] = provider historyDataSources[providers.PrometheusDataSource] = provider + + utils.SetExtensionLabels(opts.DataSourcePromConfig.ExtensionLabels) } } return realtimeDataSources, historyDataSources, hybridDataSources @@ -266,7 +271,7 @@ func initPredictorManager(opts *options.Options, realtimeDataSources map[provide } // initControllers setup controllers with manager -func initControllers(oomRecorder oom.Recorder, mgr ctrl.Manager, opts *options.Options, predictorMgr predictor.Manager, historyDataSource providers.History) { +func initControllers(ctx context.Context, oomRecorder oom.Recorder, mgr ctrl.Manager, opts *options.Options, predictorMgr predictor.Manager, historyDataSource providers.History) { discoveryClientSet, err := discovery.NewDiscoveryClientForConfig(mgr.GetConfig()) if err != nil { klog.Exit(err, "Unable to create discover client") @@ -417,6 +422,13 @@ func initControllers(oomRecorder oom.Recorder, mgr ctrl.Manager, opts *options.O }).SetupWithManager(mgr); err != nil { klog.Exit(err, "unable to create controller", "controller", "RecommendationTriggerController") } + + checker := recommendationctrl.Checker{ + Client: mgr.GetClient(), + MonitorInterval: opts.MonitorInterval, + OutDateInterval: 
opts.OutDateInterval, + } + checker.Run(ctx.Done()) } // CnpController diff --git a/cmd/craned/app/options/options.go b/cmd/craned/app/options/options.go index e2094b54a..b33db94c0 100644 --- a/cmd/craned/app/options/options.go +++ b/cmd/craned/app/options/options.go @@ -67,6 +67,12 @@ type Options struct { // CacheUnstructured indicates whether to cache Unstructured objects. When enabled, it will speed up reading Unstructured objects, but will increase memory usage. CacheUnstructured bool + + // MonitorInterval is the interval for recommendation checker + MonitorInterval time.Duration + + // OutDateInterval is the checking interval for identify a recommendation is outdated + OutDateInterval time.Duration } // NewOptions builds an empty options. @@ -115,6 +121,7 @@ func (o *Options) AddFlags(flags *pflag.FlagSet) { flags.StringVar(&o.DataSourcePromConfig.AdapterConfigMapKey, "prometheus-adapter-configmap-key", "", "prometheus adapter-configmap key") flags.StringVar(&o.DataSourcePromConfig.AdapterConfig, "prometheus-adapter-config", "", "prometheus adapter-config path") flags.StringVar(&o.DataSourcePromConfig.AdapterExtensionLabels, "prometheus-adapter-extension-labels", "", "prometheus adapter extension-labels for expressionQuery") + flags.StringVar(&o.DataSourcePromConfig.ExtensionLabels, "extension-labels", "", "extension-labels for every prometheus query") flags.StringVar(&o.DataSourcePromConfig.Auth.Username, "prometheus-auth-username", "", "prometheus auth username") flags.StringVar(&o.DataSourcePromConfig.Auth.Password, "prometheus-auth-password", "", "prometheus auth password") flags.StringVar(&o.DataSourcePromConfig.Auth.BearerToken, "prometheus-auth-bearertoken", "", "prometheus auth bearertoken") @@ -139,4 +146,6 @@ func (o *Options) AddFlags(flags *pflag.FlagSet) { flags.IntVar(&o.OOMRecordMaxNumber, "oom-record-max-number", 10000, "Max number for oom records to store in configmap") flags.IntVar(&o.TimeSeriesPredictionMaxConcurrentReconciles, "time-series-prediction-max-concurrent-reconciles", 10, "Max concurrent reconciles for TimeSeriesPrediction controller") flags.BoolVar(&o.CacheUnstructured, "cache-unstructured", true, "whether to cache Unstructured objects. 
When enabled, it will speed up reading Unstructured objects but will increase memory usage") + flags.DurationVar(&o.MonitorInterval, "recommendation-monitor-interval", time.Hour, "interval for recommendation checker") + flags.DurationVar(&o.OutDateInterval, "recommendation-outdate-interval", 24*time.Hour, "interval for identify a recommendation is outdated") } diff --git a/deploy/craned/rbac.yaml b/deploy/craned/rbac.yaml index aa62c04d2..275648654 100644 --- a/deploy/craned/rbac.yaml +++ b/deploy/craned/rbac.yaml @@ -1,13 +1,128 @@ apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: craned + namespace: crane-system +rules: +- apiGroups: + - "" + resources: + - configmaps + - secrets + verbs: + - create +- apiGroups: + - "" + resourceNames: + - craned + resources: + - configmaps + verbs: + - get + - patch + - update +- apiGroups: + - "" + resourceNames: + - clusters-secret-store + resources: + - secrets + verbs: + - get +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - get + - patch + - update + - create +--- +apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: craned rules: - - apiGroups: [ '*' ] - resources: [ '*' ] - verbs: [ "*" ] +- apiGroups: + - "" + resources: + - configmaps + - pods + - nodes + verbs: + - get + - list + - watch +- apiGroups: + - analysis.crane.io + resources: + - "*" + verbs: + - "*" +- apiGroups: + - apps + resources: + - daemonsets + - deployments + - deployments/scale + - statefulsets + - statefulsets/scale + verbs: + - get + - list + - watch +- apiGroups: + - apps + resources: + - daemonsets/status + - deployments/status + - deployments/scale + - statefulsets/status + - statefulsets/scale + verbs: + - update +- apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - '*' +- apiGroups: + - autoscaling.crane.io + resources: + - '*' + verbs: + - '*' +- apiGroups: + - "" + resources: + - events + verbs: + - create + - patch + - update +- apiGroups: + - prediction.crane.io + resources: + - '*' + verbs: + - '*' +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: craned + namespace: crane-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: craned +subjects: +- kind: ServiceAccount + name: craned + namespace: crane-system --- - apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: @@ -17,6 +132,6 @@ roleRef: kind: ClusterRole name: craned subjects: - - kind: ServiceAccount - name: craned - namespace: crane-system \ No newline at end of file +- kind: ServiceAccount + name: craned + namespace: crane-system \ No newline at end of file diff --git a/pkg/controller/ehpa/hpa.go b/pkg/controller/ehpa/hpa.go index b43d7d6cd..69ac67fea 100644 --- a/pkg/controller/ehpa/hpa.go +++ b/pkg/controller/ehpa/hpa.go @@ -208,10 +208,30 @@ func (c *EffectiveHPAController) GetHPAMetrics(ctx context.Context, ehpa *autosc var metricIdentifier string var averageValue *resource.Quantity switch metric.Type { - case autoscalingv2.ResourceMetricSourceType: - metricIdentifier = utils.GetMetricIdentifier(metric, metric.Resource.Name.String()) + case autoscalingv2.ResourceMetricSourceType, autoscalingv2.ContainerResourceMetricSourceType: + var averageUtilization *int32 + var containerName string + if metric.Resource != nil { + if metric.Resource.Target.AverageUtilization != nil { + averageUtilization = metric.Resource.Target.AverageUtilization + } + if metric.Resource.Target.AverageValue != nil { + averageValue = 
metric.Resource.Target.AverageValue + } + } + if metric.ContainerResource != nil { + containerName = metric.ContainerResource.Container + if metric.ContainerResource.Target.AverageUtilization != nil { + averageUtilization = metric.ContainerResource.Target.AverageUtilization + } + if metric.ContainerResource.Target.AverageValue != nil { + averageValue = metric.ContainerResource.Target.AverageValue + } + } + // When use AverageUtilization in EffectiveHorizontalPodAutoscaler's metricSpec, convert to AverageValue - if metric.Resource.Target.AverageUtilization != nil { + if averageUtilization != nil { + metricName := utils.GetMetricName(metric) scale, _, err := utils.GetScale(ctx, c.RestMapper, c.ScaleClient, ehpa.Namespace, ehpa.Spec.ScaleTargetRef) if err != nil { return nil, err @@ -231,24 +251,21 @@ func (c *EffectiveHPAController) GetHPAMetrics(ctx context.Context, ehpa *autosc return nil, fmt.Errorf("failed to get available pods. ") } - requests, err := utils.CalculatePodRequests(availablePods, metric.Resource.Name) + requests, err := utils.CalculatePodRequests(availablePods, v1.ResourceName(metricName), containerName) if err != nil { return nil, err } - value := int64((float64(requests) * float64(*metric.Resource.Target.AverageUtilization) / 100) / float64(len(availablePods))) + value := int64((float64(requests) * float64(*averageUtilization) / 100) / float64(len(availablePods))) averageValue = resource.NewMilliQuantity(value, resource.DecimalSI) - } else { - averageValue = metric.Resource.Target.AverageValue } case autoscalingv2.ExternalMetricSourceType: - metricIdentifier = utils.GetMetricIdentifier(metric, metric.External.Metric.Name) averageValue = metric.External.Target.AverageValue case autoscalingv2.PodsMetricSourceType: - metricIdentifier = utils.GetMetricIdentifier(metric, metric.Pods.Metric.Name) averageValue = metric.Pods.Target.AverageValue } + metricIdentifier = utils.GetPredictionMetricIdentifier(metric) if metricIdentifier == "" { continue } diff --git a/pkg/controller/ehpa/predict.go b/pkg/controller/ehpa/predict.go index 97b2d4ac1..e128c5b18 100644 --- a/pkg/controller/ehpa/predict.go +++ b/pkg/controller/ehpa/predict.go @@ -133,25 +133,12 @@ func (c *EffectiveHPAController) NewPredictionObject(ehpa *autoscalingapi.Effect var predictionMetrics []predictionapi.PredictionMetric for _, metric := range ehpa.Spec.Metrics { - var metricName string - //get metricIdentifier by metric.Type and metricName - var metricIdentifier string - switch metric.Type { - case autoscalingv2.ResourceMetricSourceType: - metricName = metric.Resource.Name.String() - metricIdentifier = utils.GetMetricIdentifier(metric, metric.Resource.Name.String()) - case autoscalingv2.ExternalMetricSourceType: - metricName = metric.External.Metric.Name - metricIdentifier = utils.GetMetricIdentifier(metric, metric.External.Metric.Name) - case autoscalingv2.PodsMetricSourceType: - metricName = metric.Pods.Metric.Name - metricIdentifier = utils.GetMetricIdentifier(metric, metric.Pods.Metric.Name) - } - + metricIdentifier := utils.GetPredictionMetricIdentifier(metric) if metricIdentifier == "" { continue } + metricName := utils.GetMetricName(metric) //get matchLabels var matchLabels []string var metricRule *prometheus_adapter.MetricRule @@ -159,7 +146,7 @@ func (c *EffectiveHPAController) NewPredictionObject(ehpa *autoscalingapi.Effect // Supreme priority: annotation expressionQuery := utils.GetExpressionQueryAnnotation(metricIdentifier, ehpa.Annotations) if expressionQuery == "" { - var nameReg string + var 
podNameReg string // get metricRule from prometheus-adapter switch metric.Type { case autoscalingv2.ResourceMetricSourceType: @@ -169,7 +156,23 @@ func (c *EffectiveHPAController) NewPredictionObject(ehpa *autoscalingapi.Effect klog.Errorf("Got MetricRulesResource prometheus-adapter-resource Failed MetricName[%s]", metricName) } else { klog.V(4).Infof("Got MetricRulesResource prometheus-adapter-resource MetricMatches[%s] SeriesName[%s]", metricRule.MetricMatches, metricRule.SeriesName) - nameReg = utils.GetPodNameReg(ehpa.Spec.ScaleTargetRef.Name, ehpa.Spec.ScaleTargetRef.Kind) + podNameReg = utils.GetPodNameReg(ehpa.Spec.ScaleTargetRef.Name, ehpa.Spec.ScaleTargetRef.Kind) + } + } + case autoscalingv2.ContainerResourceMetricSourceType: + if len(mrs.MetricRulesResource) > 0 { + metricRule = prometheus_adapter.MatchMetricRule(mrs.MetricRulesResource, metricName) + if metricRule == nil { + klog.Errorf("Got MetricRulesResource prometheus-adapter-resource Failed MetricName[%s]", metricName) + } else { + klog.V(4).Infof("Got MetricRulesResource prometheus-adapter-resource MetricMatches[%s] SeriesName[%s]", metricRule.MetricMatches, metricRule.SeriesName) + podNameReg = utils.GetPodNameReg(ehpa.Spec.ScaleTargetRef.Name, ehpa.Spec.ScaleTargetRef.Kind) + // Compared to ResourceMetricSourceType, there is an additional container-name field + containerLabel := "container" + if metricRule.ContainerLabel != "" { + containerLabel = metricRule.ContainerLabel + } + matchLabels = append(matchLabels, utils.MapSortToArray(map[string]string{containerLabel: metric.ContainerResource.Container})...) } } case autoscalingv2.PodsMetricSourceType: @@ -179,12 +182,9 @@ func (c *EffectiveHPAController) NewPredictionObject(ehpa *autoscalingapi.Effect klog.Errorf("Got MetricRulesCustomer prometheus-adapter-customer Failed MetricName[%s]", metricName) } else { klog.V(4).Infof("Got MetricRulesCustomer prometheus-adapter-customer MetricMatches[%s] SeriesName[%s]", metricRule.MetricMatches, metricRule.SeriesName) - nameReg = utils.GetPodNameReg(ehpa.Spec.ScaleTargetRef.Name, ehpa.Spec.ScaleTargetRef.Kind) - + podNameReg = utils.GetPodNameReg(ehpa.Spec.ScaleTargetRef.Name, ehpa.Spec.ScaleTargetRef.Kind) if metric.Pods.Metric.Selector != nil { - for _, i := range utils.MapSortToArray(metric.Pods.Metric.Selector.MatchLabels) { - matchLabels = append(matchLabels, i) - } + matchLabels = append(matchLabels, utils.MapSortToArray(metric.Pods.Metric.Selector.MatchLabels)...) } } } @@ -196,18 +196,16 @@ func (c *EffectiveHPAController) NewPredictionObject(ehpa *autoscalingapi.Effect } else { klog.V(4).Infof("Got MetricRulesExternal prometheus-adapter-external MetricMatches[%s] SeriesName[%s]", metricRule.MetricMatches, metricRule.SeriesName) if metric.External.Metric.Selector != nil { - for _, i := range utils.MapSortToArray(metric.External.Metric.Selector.MatchLabels) { - matchLabels = append(matchLabels, i) - } + matchLabels = append(matchLabels, utils.MapSortToArray(metric.External.Metric.Selector.MatchLabels)...) 
} } } } if metricRule != nil { - // Second priority: get default expressionQuery + // Second priority: get prometheus-adapter expressionQuery var err error - expressionQuery, err = metricRule.QueryForSeries(ehpa.Namespace, nameReg, append(mrs.ExtensionLabels, matchLabels...)) + expressionQuery, err = metricRule.QueryForSeries(ehpa.Namespace, podNameReg, append(mrs.ExtensionLabels, matchLabels...)) if err != nil { klog.Errorf("Got promSelector prometheus-adapter %v %v", metricRule, err) } else { diff --git a/pkg/controller/recommendation/recommendation_checker.go b/pkg/controller/recommendation/recommendation_checker.go new file mode 100644 index 000000000..1d9a646e5 --- /dev/null +++ b/pkg/controller/recommendation/recommendation_checker.go @@ -0,0 +1,65 @@ +package recommendation + +import ( + "context" + "time" + + "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/client" + + analysisv1alpha1 "github.com/gocrane/api/analysis/v1alpha1" + + "github.com/gocrane/crane/pkg/metrics" +) + +type Checker struct { + client.Client + MonitorInterval time.Duration + OutDateInterval time.Duration +} + +func (r Checker) Run(stopCh <-chan struct{}) { + go func() { + ticker := time.NewTicker(r.MonitorInterval) + defer ticker.Stop() + + for { + select { + case <-stopCh: + return + case <-ticker.C: + r.runChecker() + } + } + }() +} + +func (r Checker) runChecker() { + recommendList := &analysisv1alpha1.RecommendationList{} + err := r.Client.List(context.TODO(), recommendList, []client.ListOption{}...) + if err != nil { + klog.Errorf("Failed to list recommendation: %v", err) + } + + for _, recommend := range recommendList.Items { + updateStatus := "Updated" + if time.Now().Sub(recommend.Status.LastUpdateTime.Time) > r.OutDateInterval { + updateStatus = "OutDate" + } + + resultStatus := "Failed" + if len(recommend.Status.RecommendedInfo) != 0 || len(recommend.Status.RecommendedValue) != 0 { + resultStatus = "Success" + } + + metrics.RecommendationsStatus.With(map[string]string{ + "type": string(recommend.Spec.Type), + "apiversion": recommend.Spec.TargetRef.APIVersion, + "owner_kind": recommend.Spec.TargetRef.Kind, + "namespace": recommend.Spec.TargetRef.Namespace, + "owner_name": recommend.Spec.TargetRef.Name, + "update_status": updateStatus, + "result_status": resultStatus, + }).Set(time.Now().Sub(recommend.Status.LastUpdateTime.Time).Seconds()) + } +} diff --git a/pkg/controller/recommendation/recommendation_rule_controller.go b/pkg/controller/recommendation/recommendation_rule_controller.go index 2f6d8d082..feccea912 100644 --- a/pkg/controller/recommendation/recommendation_rule_controller.go +++ b/pkg/controller/recommendation/recommendation_rule_controller.go @@ -15,12 +15,16 @@ import ( "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" unstructuredv1 "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/discovery" "k8s.io/client-go/dynamic" + "k8s.io/client-go/dynamic/dynamicinformer" "k8s.io/client-go/kubernetes" "k8s.io/client-go/scale" + "k8s.io/client-go/tools/cache" "k8s.io/client-go/tools/record" "k8s.io/client-go/util/retry" "k8s.io/klog/v2" @@ -30,8 +34,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/predicate" analysisv1alph1 "github.com/gocrane/api/analysis/v1alpha1" - "github.com/gocrane/crane/pkg/known" + "github.com/gocrane/crane/pkg/metrics" "github.com/gocrane/crane/pkg/oom" predictormgr 
"github.com/gocrane/crane/pkg/predictor" "github.com/gocrane/crane/pkg/providers" @@ -53,6 +57,7 @@ type RecommendationRuleController struct { dynamicClient dynamic.Interface discoveryClient discovery.DiscoveryInterface Provider providers.History + dynamicLister DynamicLister } func (c *RecommendationRuleController) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -146,9 +151,10 @@ func (c *RecommendationRuleController) doReconcile(ctx context.Context, recommen keys = append(keys, k) } sort.Strings(keys) // sort key to get a certain order + recommendationIndex := NewRecommendationIndex(currRecommendations) for _, key := range keys { id := identities[key] - id.Recommendation = GetRecommendationFromIdentity(identities[key], currRecommendations) + id.Recommendation = recommendationIndex.GetRecommendation(id) identitiesArray = append(identitiesArray, id) } @@ -214,9 +220,11 @@ func (c *RecommendationRuleController) doReconcile(ctx context.Context, recommen for _, recommendation := range currRecommendations.Items { exist := false for _, id := range identitiesArray { - if recommendation.UID == id.Recommendation.UID { - exist = true - break + if id.Recommendation != nil { + if recommendation.UID == id.Recommendation.UID { + exist = true + break + } } } @@ -240,6 +248,8 @@ func (c *RecommendationRuleController) SetupWithManager(mgr ctrl.Manager) error c.kubeClient = kubernetes.NewForConfigOrDie(mgr.GetConfig()) c.discoveryClient = discovery.NewDiscoveryClientForConfigOrDie(mgr.GetConfig()) c.dynamicClient = dynamic.NewForConfigOrDie(mgr.GetConfig()) + dynamicInformerFactory := dynamicinformer.NewDynamicSharedInformerFactory(c.dynamicClient, 0) + c.dynamicLister = NewDynamicInformerLister(dynamicInformerFactory) return ctrl.NewControllerManagedBy(mgr). For(&analysisv1alph1.RecommendationRule{}, builder.WithPredicates(predicate.GenerationChangedPredicate{})). @@ -261,19 +271,19 @@ func (c *RecommendationRuleController) getIdentities(ctx context.Context, recomm var unstructureds []unstructuredv1.Unstructured if recommendationRule.Spec.NamespaceSelector.Any { - unstructuredList, err := c.dynamicClient.Resource(*gvr).List(ctx, metav1.ListOptions{}) + unstructuredList, err := c.dynamicLister.List(ctx, *gvr, "") if err != nil { return nil, err } - unstructureds = append(unstructureds, unstructuredList.Items...) + unstructureds = append(unstructureds, unstructuredList...) } else { for _, namespace := range recommendationRule.Spec.NamespaceSelector.MatchNames { - unstructuredList, err := c.dynamicClient.Resource(*gvr).Namespace(namespace).List(ctx, metav1.ListOptions{}) + unstructuredList, err := c.dynamicLister.List(ctx, *gvr, namespace) if err != nil { return nil, err } - unstructureds = append(unstructureds, unstructuredList.Items...) + unstructureds = append(unstructureds, unstructuredList...) 
} } @@ -309,6 +319,16 @@ func (c *RecommendationRuleController) getIdentities(ctx context.Context, recomm } } + for _, id := range identities { + metrics.SelectTargets.With(map[string]string{ + "type": id.Recommender, + "apiversion": id.APIVersion, + "owner_kind": id.Kind, + "namespace": id.Namespace, + "owner_name": id.Name, + }).Set(1) + } + return identities, nil } @@ -440,6 +460,7 @@ func executeIdentity(ctx context.Context, wg *sync.WaitGroup, recommenderMgr rec defer func() { if wg != nil { wg.Done() + metrics.RecommendationExecutionCounter.WithLabelValues(id.APIVersion, id.Kind, id.Namespace, id.Name, id.Recommender).Inc() } }() var message string @@ -515,3 +536,106 @@ func IsConvertFromAnalytics(recommendationRule *analysisv1alph1.RecommendationRu return false, "" } + +// DynamicLister is a lister for dynamic resources. +type DynamicLister interface { + // List returns a list of resources matching the given groupVersionResource. + List(ctx context.Context, gvk schema.GroupVersionResource, namespace string) ([]unstructuredv1.Unstructured, error) +} + +type dynamicInformerLister struct { + dynamicLister map[schema.GroupVersionResource]cache.GenericLister + dynamicInformerFactory dynamicinformer.DynamicSharedInformerFactory + stopCh <-chan struct{} +} + +func NewDynamicInformerLister(dynamicInformerFactory dynamicinformer.DynamicSharedInformerFactory) DynamicLister { + return &dynamicInformerLister{ + dynamicLister: map[schema.GroupVersionResource]cache.GenericLister{}, + dynamicInformerFactory: dynamicInformerFactory, + stopCh: make(chan struct{}), + } +} + +func (d *dynamicInformerLister) List(ctx context.Context, gvr schema.GroupVersionResource, namespace string) ([]unstructuredv1.Unstructured, error) { + var ( + objects []runtime.Object + err error + ) + + lister, exists := d.dynamicLister[gvr] + if !exists { + lister = d.dynamicInformerFactory.ForResource(gvr).Lister() + d.dynamicLister[gvr] = lister + d.dynamicInformerFactory.Start(d.stopCh) + if !d.dynamicInformerFactory.WaitForCacheSync(d.stopCh)[gvr] { + return nil, fmt.Errorf("failed to sync informer for %s", gvr) + } + } + if namespace != "" { + objects, err = lister.ByNamespace(namespace).List(labels.Everything()) + } else { + objects, err = lister.List(labels.Everything()) + } + if err != nil { + return nil, err + } + + var unstructuredObjects []unstructuredv1.Unstructured + for _, obj := range objects { + unstructuredObj, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj) + if err != nil { + return nil, err + } + unstructuredObjects = append(unstructuredObjects, unstructuredv1.Unstructured{Object: unstructuredObj}) + } + return unstructuredObjects, nil +} + +type IndexKey struct { + Namespace string + APIVersion string + Kind string + Name string + Recommender string +} + +type RecommendationIndex struct { + mtx sync.RWMutex + idx map[IndexKey]*analysisv1alph1.Recommendation +} + +func NewRecommendationIndex(recommendations analysisv1alph1.RecommendationList) *RecommendationIndex { + idx := make(map[IndexKey]*analysisv1alph1.Recommendation, len(recommendations.Items)) + for i := range recommendations.Items { + r := &recommendations.Items[i] + idx[createIndexKey(r)] = r + } + + return &RecommendationIndex{ + idx: idx, + } +} + +func createIndexKey(r *analysisv1alph1.Recommendation) IndexKey { + return IndexKey{ + Kind: r.Spec.TargetRef.Kind, + APIVersion: r.Spec.TargetRef.APIVersion, + Namespace: r.Spec.TargetRef.Namespace, + Name: r.Spec.TargetRef.Name, + Recommender: string(r.Spec.Type), + } +} + +func (idx 
*RecommendationIndex) GetRecommendation(id ObjectIdentity) *analysisv1alph1.Recommendation { + key := IndexKey{ + Kind: id.Kind, + APIVersion: id.APIVersion, + Namespace: id.Namespace, + Name: id.Name, + Recommender: id.Recommender, + } + idx.mtx.RLock() + defer idx.mtx.RUnlock() + return idx.idx[key] +} diff --git a/pkg/controller/recommendation/recommendation_rule_controller_test.go b/pkg/controller/recommendation/recommendation_rule_controller_test.go new file mode 100644 index 000000000..f96536fcd --- /dev/null +++ b/pkg/controller/recommendation/recommendation_rule_controller_test.go @@ -0,0 +1,115 @@ +package recommendation + +import ( + "reflect" + "testing" + + analysisv1alph1 "github.com/gocrane/api/analysis/v1alpha1" + corev1 "k8s.io/api/core/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestRecommendationIndex_GetRecommendation(t *testing.T) { + type fields struct { + recommendationList analysisv1alph1.RecommendationList + } + type args struct { + id ObjectIdentity + } + + tests := []struct { + name string + fields fields + args args + want *analysisv1alph1.Recommendation + }{ + { + name: "TestRecommendationIndex_GetRecommendation good case", + fields: fields{ + recommendationList: analysisv1alph1.RecommendationList{ + Items: []analysisv1alph1.Recommendation{ + { + ObjectMeta: v1.ObjectMeta{ + Name: "test-recommendation-rule", + Namespace: "test-namespace", + }, + Spec: analysisv1alph1.RecommendationSpec{ + TargetRef: corev1.ObjectReference{ + Namespace: "test-namespace", + Kind: "Deployment", + Name: "test-deployment-bar", + APIVersion: "app/v1", + }, + Type: analysisv1alph1.AnalysisTypeResource, + }, + }, + { + ObjectMeta: v1.ObjectMeta{ + Name: "test-recommendation-rule", + Namespace: "test-namespace", + }, + Spec: analysisv1alph1.RecommendationSpec{ + TargetRef: corev1.ObjectReference{ + Namespace: "test-namespace", + Kind: "Deployment", + Name: "test-deployment-foo", + APIVersion: "app/v1", + }, + Type: analysisv1alph1.AnalysisTypeResource, + }, + }, + }, + }, + }, + want: &analysisv1alph1.Recommendation{ + ObjectMeta: v1.ObjectMeta{ + Name: "test-recommendation-rule", + Namespace: "test-namespace", + }, + Spec: analysisv1alph1.RecommendationSpec{ + TargetRef: corev1.ObjectReference{ + Namespace: "test-namespace", + Kind: "Deployment", + Name: "test-deployment-bar", + APIVersion: "app/v1", + }, + Type: analysisv1alph1.AnalysisTypeResource, + }, + }, + args: args{ + id: ObjectIdentity{ + Name: "test-deployment-bar", + Namespace: "test-namespace", + APIVersion: "app/v1", + Kind: "Deployment", + Recommender: "Resource", + }, + }, + }, + { + name: "TestRecommendationIndex_GetRecommendation empty case", + fields: fields{ + recommendationList: analysisv1alph1.RecommendationList{ + Items: []analysisv1alph1.Recommendation{}, + }, + }, + args: args{ + id: ObjectIdentity{ + Name: "test-deployment-name", + Namespace: "test-namespace", + APIVersion: "app/v1", + Kind: "Deployment", + Recommender: "Resources", + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + idx := NewRecommendationIndex(tt.fields.recommendationList) + if got := idx.GetRecommendation(tt.args.id); !reflect.DeepEqual(got, tt.want) { + t.Errorf("GetRecommendation() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/controller/recommendation/updater.go b/pkg/controller/recommendation/updater.go index 1bc8178ab..7a2359096 100644 --- a/pkg/controller/recommendation/updater.go +++ b/pkg/controller/recommendation/updater.go @@ -99,7 +99,8 @@ func (c 
*RecommendationController) UpdateRecommendation(ctx context.Context, rec if needUpdate { unstructed.SetAnnotations(annotation) - err = c.Client.Update(ctx, unstructed) + //Convergence craned permissions + err = c.Client.Status().Update(ctx, unstructed) if err != nil { return false, fmt.Errorf("update target annotation failed: %v. ", err) } diff --git a/pkg/metrics/analysis.go b/pkg/metrics/analysis.go index 10b37cee2..e0078fe7f 100644 --- a/pkg/metrics/analysis.go +++ b/pkg/metrics/analysis.go @@ -6,6 +6,16 @@ import ( ) var ( + RecommendationExecutionCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "crane", + Subsystem: "analysis", + Name: "recommendation_execution_total", + Help: "The number of times Recommendation has been executed", + }, + []string{"apiversion", "owner_kind", "namespace", "owner_name", "type"}, + ) + ResourceRecommendation = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: "crane", @@ -25,8 +35,28 @@ var ( }, []string{"apiversion", "owner_kind", "namespace", "owner_name"}, ) + + SelectTargets = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "crane", + Subsystem: "analysis", + Name: "select_targets", + Help: "The number of selected targets", + }, + []string{"type", "apiversion", "owner_kind", "namespace", "owner_name"}, + ) + + RecommendationsStatus = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "crane", + Subsystem: "analysis", + Name: "recommendations_status", + Help: "The status of recommendations", + }, + []string{"type", "apiversion", "owner_kind", "namespace", "owner_name", "update_status", "result_status"}, + ) ) func init() { - metrics.Registry.MustRegister(ResourceRecommendation, ReplicasRecommendation) + metrics.Registry.MustRegister(RecommendationExecutionCounter, ResourceRecommendation, ReplicasRecommendation, SelectTargets, RecommendationsStatus) } diff --git a/pkg/metrics/metric_collector.go b/pkg/metrics/metric_collector.go index 819d3d1e6..348770d08 100644 --- a/pkg/metrics/metric_collector.go +++ b/pkg/metrics/metric_collector.go @@ -96,29 +96,7 @@ func (c *CraneMetricCollector) Collect(ch chan<- prometheus.Metric) { if err != nil { klog.Errorf("Failed to list ehpa: %v", err) } - var predictionMetrics []PredictionMetric for _, ehpa := range ehpaList.Items { - namespace := ehpa.Namespace - if ehpa.Spec.Prediction != nil { - var tsp predictionapi.TimeSeriesPrediction - tspName := "ehpa-" + ehpa.Name - - err := c.Get(context.TODO(), client.ObjectKey{Namespace: namespace, Name: tspName}, &tsp) - if err != nil { - klog.Errorf("Failed to get tsp: %v", err) - return - } - metricListTsp := c.getMetricsTsp(&tsp) - for _, metric := range metricListTsp { - if MetricContains(predictionMetrics, metric) { - continue - } - - ch <- prometheus.NewMetricWithTimestamp(metric.Timestamp, prometheus.MustNewConstMetric(metric.Desc, prometheus.GaugeValue, metric.MetricValue, metric.TargetKind, metric.TargetName, metric.TargetNamespace, metric.ResourceIdentifier, metric.Algorithm)) - predictionMetrics = append(predictionMetrics, metric) - } - } - if ehpa.Spec.Crons != nil { metricCron, err := c.getMetricsCron(&ehpa) if err != nil { @@ -130,6 +108,27 @@ func (c *CraneMetricCollector) Collect(ch chan<- prometheus.Metric) { } } } + + if utilfeature.DefaultFeatureGate.Enabled(features.CraneTimeSeriesPrediction) { + var tspList predictionapi.TimeSeriesPredictionList + err := c.List(context.TODO(), &tspList) + if err != nil { + klog.Errorf("Failed to list tsp: %v", err) + } + + var predictionMetrics []PredictionMetric + 
for _, tsp := range tspList.Items { + metricListTsp := c.getMetricsTsp(&tsp) + for _, metric := range metricListTsp { + if MetricContains(predictionMetrics, metric) { + continue + } + + ch <- prometheus.NewMetricWithTimestamp(metric.Timestamp, prometheus.MustNewConstMetric(metric.Desc, prometheus.GaugeValue, metric.MetricValue, metric.TargetKind, metric.TargetName, metric.TargetNamespace, metric.ResourceIdentifier, metric.Algorithm)) + predictionMetrics = append(predictionMetrics, metric) + } + } + } metricsRuleError := c.getMetricsRuleError() for _, i := range metricsRuleError { ch <- i diff --git a/pkg/prometheus-adapter/config_fetcher.go b/pkg/prometheus-adapter/config_fetcher.go index 6bc17e20c..fd616e8d5 100644 --- a/pkg/prometheus-adapter/config_fetcher.go +++ b/pkg/prometheus-adapter/config_fetcher.go @@ -151,7 +151,7 @@ func FlushRules(metricsDiscoveryConfig config.MetricsDiscoveryConfig, mapper met } if len(errStr) > 0 { - return fmt.Errorf(strings.Join(errStr, ",")) + return fmt.Errorf("%s", strings.Join(errStr, ",")) } return err } diff --git a/pkg/prometheus-adapter/expression.go b/pkg/prometheus-adapter/expression.go index b38ae9c91..e53355311 100644 --- a/pkg/prometheus-adapter/expression.go +++ b/pkg/prometheus-adapter/expression.go @@ -47,12 +47,13 @@ type MetricRules struct { } type MetricRule struct { - MetricMatches string - SeriesName string - ResConverter naming.ResourceConverter - Template *template.Template - Namespaced bool - LabelMatchers []string + MetricMatches string + SeriesName string + ResConverter naming.ResourceConverter + Template *template.Template + Namespaced bool + LabelMatchers []string + ContainerLabel string } type QueryTemplateArgs struct { @@ -129,7 +130,7 @@ func SetExtensionLabels(extensionLabels string) { } } -// GetMetricRuleResourceFromRules produces a MetricNamer for each rule in the given config. +// GetMetricRulesFromResourceRules produces a MetricNamer for each rule in the given config. 
func GetMetricRulesFromResourceRules(cfg config.ResourceRules, mapper meta.RESTMapper) (metricRules []MetricRule, metricRulesError []string, err error) { // get cpu MetricsQuery if cfg.CPU.ContainerQuery != "" { @@ -154,10 +155,11 @@ func GetMetricRulesFromResourceRules(cfg config.ResourceRules, mapper meta.RESTM klog.Errorf("unable to parse metrics query template %q: %v", cfg.CPU.ContainerQuery, err) } else { metricRules = append(metricRules, MetricRule{ - MetricMatches: "cpu", - ResConverter: resConverter, - Template: templ, - Namespaced: true, + MetricMatches: "cpu", + ResConverter: resConverter, + Template: templ, + Namespaced: true, + ContainerLabel: cfg.CPU.ContainerLabel, }) } } @@ -190,10 +192,11 @@ func GetMetricRulesFromResourceRules(cfg config.ResourceRules, mapper meta.RESTM klog.Errorf("unable to parse metrics query template %q: %v", cfg.Memory.ContainerQuery, err) } else { metricRules = append(metricRules, MetricRule{ - MetricMatches: "memory", - ResConverter: resConverter, - Template: templ, - Namespaced: true, + MetricMatches: "memory", + ResConverter: resConverter, + Template: templ, + Namespaced: true, + ContainerLabel: cfg.Memory.ContainerLabel, }) } } diff --git a/pkg/providers/config.go b/pkg/providers/config.go index 054b8696d..32fdef34d 100644 --- a/pkg/providers/config.go +++ b/pkg/providers/config.go @@ -13,6 +13,7 @@ type PromConfig struct { AdapterConfigMapKey string AdapterConfig string AdapterExtensionLabels string + ExtensionLabels string Timeout time.Duration KeepAlive time.Duration InsecureSkipVerify bool diff --git a/pkg/server/store/secret/cluster.go b/pkg/server/store/secret/cluster.go index c47c5d66a..368304c3f 100644 --- a/pkg/server/store/secret/cluster.go +++ b/pkg/server/store/secret/cluster.go @@ -48,9 +48,8 @@ func updateClusterInSecret(cluster *store.Cluster, secret *v1.Secret) error { } } -func deleteClusterInSecret(clusterid string, secret *v1.Secret) error { +func deleteClusterInSecret(clusterid string, secret *v1.Secret) { delete(secret.Data, clusterid) - return nil } func getClusterInSecret(clusterid string, secret *v1.Secret) (*store.Cluster, error) { @@ -129,10 +128,7 @@ func (c *clusters) DeleteCluster(ctx context.Context, clusterid string) error { if err != nil { return err } - err = deleteClusterInSecret(clusterid, secret) - if err != nil { - return err - } + deleteClusterInSecret(clusterid, secret) _, err = c.writeSecretStore(ctx, secret) return err } diff --git a/pkg/utils/ehpa.go b/pkg/utils/ehpa.go index ead7b081f..dbdd0f2ed 100644 --- a/pkg/utils/ehpa.go +++ b/pkg/utils/ehpa.go @@ -39,7 +39,7 @@ func IsEHPACronEnabled(ehpa *autoscalingapi.EffectiveHorizontalPodAutoscaler) bo // GetPredictionMetricName return metric name used by prediction func GetPredictionMetricName(sourceType autoscalingv2.MetricSourceType) (metricName string) { switch sourceType { - case autoscalingv2.ResourceMetricSourceType, autoscalingv2.PodsMetricSourceType, autoscalingv2.ExternalMetricSourceType: + case autoscalingv2.ResourceMetricSourceType, autoscalingv2.ContainerResourceMetricSourceType, autoscalingv2.PodsMetricSourceType, autoscalingv2.ExternalMetricSourceType: metricName = known.MetricNamePrediction } @@ -51,19 +51,36 @@ func GetCronMetricName() string { return known.MetricNameCron } -// GetGeneralPredictionMetricName return metric name used by prediction -func GetMetricIdentifier(metric autoscalingv2.MetricSpec, name string) string { +func GetMetricName(metric autoscalingv2.MetricSpec) string { + switch metric.Type { + case 
autoscalingv2.PodsMetricSourceType: + return metric.Pods.Metric.Name + case autoscalingv2.ResourceMetricSourceType: + return metric.Resource.Name.String() + case autoscalingv2.ContainerResourceMetricSourceType: + return metric.ContainerResource.Name.String() + case autoscalingv2.ExternalMetricSourceType: + return metric.External.Metric.Name + default: + return "" + } +} + +// GetPredictionMetricIdentifier return metric name used by prediction +func GetPredictionMetricIdentifier(metric autoscalingv2.MetricSpec) string { var prefix string switch metric.Type { case autoscalingv2.PodsMetricSourceType: prefix = "pods" case autoscalingv2.ResourceMetricSourceType: prefix = "resource" + case autoscalingv2.ContainerResourceMetricSourceType: + prefix = "container-resource" case autoscalingv2.ExternalMetricSourceType: prefix = "external" } - return fmt.Sprintf("%s.%s", prefix, name) + return fmt.Sprintf("%s.%s", prefix, GetMetricName(metric)) } // GetExpressionQueryAnnotation return metric query from annotation by metricName @@ -95,7 +112,7 @@ func IsExpressionQueryAnnotationEnabled(metricIdentifier string, annotations map return false } -// GetExpressionQuery return metric query +// GetExpressionQueryDefault return default metric query func GetExpressionQueryDefault(metric autoscalingv2.MetricSpec, namespace string, name string, kind string) string { var expressionQuery string switch metric.Type { @@ -106,6 +123,13 @@ func GetExpressionQueryDefault(metric autoscalingv2.MetricSpec, namespace string case "memory": expressionQuery = GetWorkloadMemUsageExpression(namespace, name, kind) } + case autoscalingv2.ContainerResourceMetricSourceType: + switch metric.ContainerResource.Name { + case "cpu": + expressionQuery = GetContainerCpuUsageExpression(namespace, name, kind, metric.ContainerResource.Container) + case "memory": + expressionQuery = GetContainerMemUsageExpression(namespace, name, kind, metric.ContainerResource.Container) + } case autoscalingv2.PodsMetricSourceType: var labels []string if metric.Pods.Metric.Selector != nil { diff --git a/pkg/utils/expression_prom_default.go b/pkg/utils/expression_prom_default.go index 9d9f00c67..c97faf863 100644 --- a/pkg/utils/expression_prom_default.go +++ b/pkg/utils/expression_prom_default.go @@ -2,46 +2,48 @@ package utils import ( "fmt" + "strings" ) // todo: later we change these templates to configurable like prometheus-adapter const ( + ExtensionLabelsHolder = `EXTENSION_LABELS_HOLDER` // WorkloadCpuUsageExprTemplate is used to query workload cpu usage by promql, param is namespace,workload-name,duration str - WorkloadCpuUsageExprTemplate = `sum(irate(container_cpu_usage_seconds_total{namespace="%s",pod=~"%s",container!=""}[%s]))` + WorkloadCpuUsageExprTemplate = `sum(irate(container_cpu_usage_seconds_total{namespace="%s",pod=~"%s",container!=""EXTENSION_LABELS_HOLDER}[%s]))` // WorkloadMemUsageExprTemplate is used to query workload mem usage by promql, param is namespace, workload-name - WorkloadMemUsageExprTemplate = `sum(container_memory_working_set_bytes{namespace="%s",pod=~"%s",container!=""})` + WorkloadMemUsageExprTemplate = `sum(container_memory_working_set_bytes{namespace="%s",pod=~"%s",container!=""EXTENSION_LABELS_HOLDER})` // following is node exporter metric for node cpu/memory usage // NodeCpuUsageExprTemplate is used to query node cpu usage by promql, param is node name which prometheus scrape, duration str - NodeCpuUsageExprTemplate = `sum(count(node_cpu_seconds_total{mode="idle",instance=~"(%s)(:\\d+)?"}) by (mode, cpu)) - 
sum(irate(node_cpu_seconds_total{mode="idle",instance=~"(%s)(:\\d+)?"}[%s]))` + NodeCpuUsageExprTemplate = `sum(count(node_cpu_seconds_total{mode="idle",instance=~"(%s)(:\\d+)?"EXTENSION_LABELS_HOLDER}) by (mode, cpu)) - sum(irate(node_cpu_seconds_total{mode="idle",instance=~"(%s)(:\\d+)?"EXTENSION_LABELS_HOLDER}[%s]))` // NodeMemUsageExprTemplate is used to query node memory usage by promql, param is node name, node name which prometheus scrape - NodeMemUsageExprTemplate = `sum(node_memory_MemTotal_bytes{instance=~"(%s)(:\\d+)?"} - node_memory_MemAvailable_bytes{instance=~"(%s)(:\\d+)?"})` + NodeMemUsageExprTemplate = `sum(node_memory_MemTotal_bytes{instance=~"(%s)(:\\d+)?EXTENSION_LABELS_HOLDER"} - node_memory_MemAvailable_bytes{instance=~"(%s)(:\\d+)?"EXTENSION_LABELS_HOLDER})` // NodeCpuRequestUtilizationExprTemplate is used to query node cpu request utilization by promql, param is node name, node name which prometheus scrape - NodeCpuRequestUtilizationExprTemplate = `sum(kube_pod_container_resource_requests{node="%s", resource="cpu", unit="core"} * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"}) by (node)) by (node) / sum(kube_node_status_capacity{node="%s", resource="cpu", unit="core"} * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"}) by (node)) by (node) ` + NodeCpuRequestUtilizationExprTemplate = `sum(kube_pod_container_resource_requests{node="%s", resource="cpu", unit="core"EXTENSION_LABELS_HOLDER} * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"EXTENSION_LABELS_HOLDER}) by (node)) by (node) / sum(kube_node_status_capacity{node="%s", resource="cpu", unit="core"EXTENSION_LABELS_HOLDER} * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"EXTENSION_LABELS_HOLDER}) by (node)) by (node) ` // NodeMemRequestUtilizationExprTemplate is used to query node memory request utilization by promql, param is node name, node name which prometheus scrape - NodeMemRequestUtilizationExprTemplate = `sum(kube_pod_container_resource_requests{node="%s", resource="memory", unit="byte", namespace!=""} * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"}) by (node)) by (node) / sum(kube_node_status_capacity{node="%s", resource="memory", unit="byte"} * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"}) by (node)) by (node) ` + NodeMemRequestUtilizationExprTemplate = `sum(kube_pod_container_resource_requests{node="%s", resource="memory", unit="byte", namespace!=""EXTENSION_LABELS_HOLDER} * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"EXTENSION_LABELS_HOLDER}) by (node)) by (node) / sum(kube_node_status_capacity{node="%s", resource="memory", unit="byte"EXTENSION_LABELS_HOLDER} * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"EXTENSION_LABELS_HOLDER}) by (node)) by (node) ` // NodeCpuUsageUtilizationExprTemplate is used to query node memory usage utilization by promql, param is 
node name, node name which prometheus scrape - NodeCpuUsageUtilizationExprTemplate = `sum(label_replace(irate(container_cpu_usage_seconds_total{instance="%s", container!="POD", container!="",image!=""}[1h]), "node", "$1", "instance", "(^[^:]+)") * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"}) by (node)) by (node) / sum(kube_node_status_capacity{node="%s", resource="cpu", unit="core"} * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"}) by (node)) by (node) ` + NodeCpuUsageUtilizationExprTemplate = `sum(label_replace(irate(container_cpu_usage_seconds_total{instance="%s", container!="POD", container!="",image!=""EXTENSION_LABELS_HOLDER}[1h]), "node", "$1", "instance", "(^[^:]+)") * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"EXTENSION_LABELS_HOLDER}) by (node)) by (node) / sum(kube_node_status_capacity{node="%s", resource="cpu", unit="core"EXTENSION_LABELS_HOLDER} * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"EXTENSION_LABELS_HOLDER}) by (node)) by (node) ` // NodeMemUsageUtilizationExprTemplate is used to query node memory usage utilization by promql, param is node name, node name which prometheus scrape - NodeMemUsageUtilizationExprTemplate = `sum(label_replace(container_memory_usage_bytes{instance="%s", namespace!="",container!="POD", container!="",image!=""}, "node", "$1", "instance", "(^[^:]+)") * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"}) by (node)) by (node) / sum(kube_node_status_capacity{node="%s", resource="memory", unit="byte"} * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"}) by (node)) by (node) ` + NodeMemUsageUtilizationExprTemplate = `sum(label_replace(container_memory_usage_bytes{instance="%s", namespace!="",container!="POD", container!="",image!=""EXTENSION_LABELS_HOLDER}, "node", "$1", "instance", "(^[^:]+)") * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"EXTENSION_LABELS_HOLDER}) by (node)) by (node) / sum(kube_node_status_capacity{node="%s", resource="memory", unit="byte"EXTENSION_LABELS_HOLDER} * on (node) group_left() max(kube_node_labels{label_beta_kubernetes_io_instance_type!~"eklet", label_node_kubernetes_io_instance_type!~"eklet"EXTENSION_LABELS_HOLDER}) by (node)) by (node) ` // PodCpuUsageExprTemplate is used to query pod cpu usage by promql, param is namespace,pod, duration str - PodCpuUsageExprTemplate = `sum(irate(container_cpu_usage_seconds_total{container!="POD",namespace="%s",pod="%s"}[%s]))` + PodCpuUsageExprTemplate = `sum(irate(container_cpu_usage_seconds_total{container!="POD",namespace="%s",pod="%s"EXTENSION_LABELS_HOLDER}[%s]))` // PodMemUsageExprTemplate is used to query pod cpu usage by promql, param is namespace,pod - PodMemUsageExprTemplate = `sum(container_memory_working_set_bytes{container!="POD",namespace="%s",pod="%s"})` + PodMemUsageExprTemplate = `sum(container_memory_working_set_bytes{container!="POD",namespace="%s",pod="%s"EXTENSION_LABELS_HOLDER})` // ContainerCpuUsageExprTemplate 
is used to query container cpu usage by promql, param is namespace,pod,container duration str - ContainerCpuUsageExprTemplate = `irate(container_cpu_usage_seconds_total{container!="POD",namespace="%s",pod=~"%s",container="%s"}[%s])` + ContainerCpuUsageExprTemplate = `irate(container_cpu_usage_seconds_total{container!="POD",namespace="%s",pod=~"%s",container="%s"EXTENSION_LABELS_HOLDER}[%s])` // ContainerMemUsageExprTemplate is used to query container cpu usage by promql, param is namespace,pod,container - ContainerMemUsageExprTemplate = `container_memory_working_set_bytes{container!="POD",namespace="%s",pod=~"%s",container="%s"}` + ContainerMemUsageExprTemplate = `container_memory_working_set_bytes{container!="POD",namespace="%s",pod=~"%s",container="%s"EXTENSION_LABELS_HOLDER}` - CustomerExprTemplate = `sum(%s{%s})` + CustomerExprTemplate = `sum(%s{%sEXTENSION_LABELS_HOLDER})` // Container network cumulative count of bytes received - queryFmtNetReceiveBytes = `sum(rate(container_network_receive_bytes_total{namespace="%s",pod=~"%s",container!=""}[3m]))` + queryFmtNetReceiveBytes = `sum(rate(container_network_receive_bytes_total{namespace="%s",pod=~"%s",container!=""EXTENSION_LABELS_HOLDER}[3m]))` // Container network cumulative count of bytes transmitted - queryFmtNetTransferBytes = `sum(rate(container_network_transmit_bytes_total{namespace="%s",pod=~"%s",container!=""}[3m]))` + queryFmtNetTransferBytes = `sum(rate(container_network_transmit_bytes_total{namespace="%s",pod=~"%s",container!=""EXTENSION_LABELS_HOLDER}[3m]))` ) const ( @@ -51,6 +53,31 @@ const ( PostRegMatchesPodStatefulset = `[0-9]+$` ) +var ExtensionLabelArray []string +var extensionLabelsString string + +func SetExtensionLabels(extensionLabels string) { + if extensionLabels != "" { + for _, label := range strings.Split(extensionLabels, ",") { + ExtensionLabelArray = append(ExtensionLabelArray, label) + } + + extensionLabelsString = "," + for index, label := range ExtensionLabelArray { + labelArr := strings.Split(label, "=") + if len(labelArr) != 2 { + // skip the invalid kv + continue + } + + extensionLabelsString += fmt.Sprintf("%s=\"%s\"", labelArr[0], labelArr[1]) + if index != len(ExtensionLabelArray)-1 { + extensionLabelsString += "," + } + } + } +} + func GetPodNameReg(resourceName string, resourceType string) string { switch resourceType { case "DaemonSet": @@ -66,61 +93,66 @@ func GetPodNameReg(resourceName string, resourceType string) string { } func GetCustomerExpression(metricName string, labels string) string { - return fmt.Sprintf(CustomerExprTemplate, metricName, labels) + return fmtSprintfInternal(CustomerExprTemplate, metricName, labels) } func GetWorkloadCpuUsageExpression(namespace string, name string, kind string) string { - return fmt.Sprintf(WorkloadCpuUsageExprTemplate, namespace, GetPodNameReg(name, kind), "3m") + return fmtSprintfInternal(WorkloadCpuUsageExprTemplate, namespace, GetPodNameReg(name, kind), "3m") } func GetWorkloadMemUsageExpression(namespace string, name string, kind string) string { - return fmt.Sprintf(WorkloadMemUsageExprTemplate, namespace, GetPodNameReg(name, kind)) + return fmtSprintfInternal(WorkloadMemUsageExprTemplate, namespace, GetPodNameReg(name, kind)) } func GetContainerCpuUsageExpression(namespace string, workloadName string, kind string, containerName string) string { - return fmt.Sprintf(ContainerCpuUsageExprTemplate, namespace, GetPodNameReg(workloadName, kind), containerName, "3m") + return fmtSprintfInternal(ContainerCpuUsageExprTemplate, namespace, 
GetPodNameReg(workloadName, kind), containerName, "3m") } func GetContainerMemUsageExpression(namespace string, workloadName string, kind string, containerName string) string { - return fmt.Sprintf(ContainerMemUsageExprTemplate, namespace, GetPodNameReg(workloadName, kind), containerName) + return fmtSprintfInternal(ContainerMemUsageExprTemplate, namespace, GetPodNameReg(workloadName, kind), containerName) } func GetPodCpuUsageExpression(namespace string, name string) string { - return fmt.Sprintf(PodCpuUsageExprTemplate, namespace, name, "3m") + return fmtSprintfInternal(PodCpuUsageExprTemplate, namespace, name, "3m") } func GetPodMemUsageExpression(namespace string, name string) string { - return fmt.Sprintf(PodMemUsageExprTemplate, namespace, name) + return fmtSprintfInternal(PodMemUsageExprTemplate, namespace, name) } func GetNodeCpuUsageExpression(nodeName string) string { - return fmt.Sprintf(NodeCpuUsageExprTemplate, nodeName, nodeName, "3m") + return fmtSprintfInternal(NodeCpuUsageExprTemplate, nodeName, nodeName, "3m") } func GetNodeMemUsageExpression(nodeName string) string { - return fmt.Sprintf(NodeMemUsageExprTemplate, nodeName, nodeName) + return fmtSprintfInternal(NodeMemUsageExprTemplate, nodeName, nodeName) } func GetNodeCpuRequestUtilizationExpression(nodeName string) string { - return fmt.Sprintf(NodeCpuRequestUtilizationExprTemplate, nodeName, nodeName) + return fmtSprintfInternal(NodeCpuRequestUtilizationExprTemplate, nodeName, nodeName) } func GetNodeMemRequestUtilizationExpression(nodeName string) string { - return fmt.Sprintf(NodeMemRequestUtilizationExprTemplate, nodeName, nodeName) + return fmtSprintfInternal(NodeMemRequestUtilizationExprTemplate, nodeName, nodeName) } func GetNodeCpuUsageUtilizationExpression(nodeName string) string { - return fmt.Sprintf(NodeCpuUsageUtilizationExprTemplate, nodeName, nodeName) + return fmtSprintfInternal(NodeCpuUsageUtilizationExprTemplate, nodeName, nodeName) } func GetNodeMemUsageUtilizationExpression(nodeName string) string { - return fmt.Sprintf(NodeMemUsageUtilizationExprTemplate, nodeName, nodeName) + return fmtSprintfInternal(NodeMemUsageUtilizationExprTemplate, nodeName, nodeName) } func GetWorkloadNetReceiveBytesExpression(namespace string, name string, kind string) string { - return fmt.Sprintf(queryFmtNetReceiveBytes, namespace, GetPodNameReg(name, kind)) + return fmtSprintfInternal(queryFmtNetReceiveBytes, namespace, GetPodNameReg(name, kind)) } func GetWorkloadNetTransferBytesExpression(namespace string, name string, kind string) string { - return fmt.Sprintf(queryFmtNetTransferBytes, namespace, GetPodNameReg(name, kind)) + return fmtSprintfInternal(queryFmtNetTransferBytes, namespace, GetPodNameReg(name, kind)) +} + +func fmtSprintfInternal(format string, a ...interface{}) string { + formatReplaced := strings.ReplaceAll(format, ExtensionLabelsHolder, extensionLabelsString) + return fmt.Sprintf(formatReplaced, a...) } diff --git a/pkg/utils/pod.go b/pkg/utils/pod.go index eef7ea3e0..929cfa0f2 100644 --- a/pkg/utils/pod.go +++ b/pkg/utils/pod.go @@ -110,11 +110,14 @@ func EvictPodWithGracePeriod(client clientset.Interface, pod *v1.Pod, gracePerio return client.CoreV1().Pods(pod.Namespace).EvictV1beta1(context.Background(), e) } -// CalculatePodRequests sum request total from pods -func CalculatePodRequests(pods []v1.Pod, resource v1.ResourceName) (int64, error) { +// CalculatePodRequests sum request total from pods. If the containerName is specified, the total amount of requests for that container will be calculated. 
+func CalculatePodRequests(pods []v1.Pod, resource v1.ResourceName, containerName string) (int64, error) { var requests int64 for _, pod := range pods { for _, c := range pod.Spec.Containers { + if containerName != "" && c.Name != containerName { + continue + } if containerRequest, ok := c.Resources.Requests[resource]; ok { requests += containerRequest.MilliValue() } else { diff --git a/pkg/utils/pod_test.go b/pkg/utils/pod_test.go index d1e64fd39..badeecdcf 100644 --- a/pkg/utils/pod_test.go +++ b/pkg/utils/pod_test.go @@ -61,9 +61,10 @@ func TestCalculatePodRequests(t *testing.T) { } tests := []struct { - description string - resource v1.ResourceName - expect int64 + description string + resource v1.ResourceName + containerName string + expect int64 }{ { description: "calculate cpu request total", @@ -75,10 +76,22 @@ func TestCalculatePodRequests(t *testing.T) { resource: v1.ResourceMemory, expect: 60000, }, + { + description: "calculate cpu request total of container1", + resource: v1.ResourceCPU, + containerName: "container1", + expect: 3000, + }, + { + description: "calculate memory request total of container1", + resource: v1.ResourceMemory, + containerName: "container1", + expect: 30000, + }, } for _, test := range tests { - requests, err := CalculatePodRequests(pods, test.resource) + requests, err := CalculatePodRequests(pods, test.resource, test.containerName) if err != nil { t.Errorf("Failed to calculatePodRequests: %v", err) }
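
The new --extension-labels flag is parsed by utils.SetExtensionLabels and spliced into every built-in PromQL template through the EXTENSION_LABELS_HOLDER placeholder, so each generated query carries the configured extra label matchers. A minimal standalone sketch of that substitution, using one of the real templates but a hypothetical flag value and pod-name regex:

package main

import (
	"fmt"
	"strings"
)

// WorkloadCpuUsageExprTemplate as defined in pkg/utils/expression_prom_default.go:
// the placeholder sits inside the label-matcher list of the series selector.
const workloadCpuUsageExprTemplate = `sum(irate(container_cpu_usage_seconds_total{namespace="%s",pod=~"%s",container!=""EXTENSION_LABELS_HOLDER}[%s]))`

// renderExtensionLabels is a simplified stand-in for SetExtensionLabels: it turns
// "k1=v1,k2=v2" into `,k1="v1",k2="v2"`, skipping malformed entries.
func renderExtensionLabels(extensionLabels string) string {
	var parts []string
	for _, kv := range strings.Split(extensionLabels, ",") {
		pair := strings.SplitN(kv, "=", 2)
		if len(pair) != 2 {
			continue
		}
		parts = append(parts, fmt.Sprintf(`%s="%s"`, pair[0], pair[1]))
	}
	if len(parts) == 0 {
		return ""
	}
	return "," + strings.Join(parts, ",")
}

func main() {
	// Hypothetical flag value: --extension-labels=cluster=cls-prod,region=gz
	labels := renderExtensionLabels("cluster=cls-prod,region=gz")
	expr := strings.ReplaceAll(workloadCpuUsageExprTemplate, "EXTENSION_LABELS_HOLDER", labels)

	// Illustrative namespace and pod-name regex (the real caller derives the
	// regex via GetPodNameReg).
	fmt.Printf(expr+"\n", "default", "nginx-[a-z0-9]+-[a-z0-9]+", "3m")
	// sum(irate(container_cpu_usage_seconds_total{namespace="default",pod=~"nginx-[a-z0-9]+-[a-z0-9]+",container!="",cluster="cls-prod",region="gz"}[3m]))
}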
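
For both Resource and ContainerResource metrics, GetHPAMetrics now converts a target AverageUtilization into an AverageValue by summing the (optionally per-container) requests of the available pods and scaling by the utilization percentage. A back-of-the-envelope sketch of that arithmetic with made-up numbers:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Suppose the scaled workload has 2 available pods and the targeted container
	// requests 500m CPU in each pod, so CalculatePodRequests returns 1000 (milli).
	availablePods := 2
	requestsMilli := int64(1000)
	averageUtilization := int32(60) // target taken from the ContainerResource metric spec

	// Same formula as GetHPAMetrics:
	//   value := int64((float64(requests) * float64(*averageUtilization) / 100) / float64(len(availablePods)))
	value := int64((float64(requestsMilli) * float64(averageUtilization) / 100) / float64(availablePods))
	averageValue := resource.NewMilliQuantity(value, resource.DecimalSI)

	// 300m per pod is what the generated HPA metric spec receives as AverageValue.
	fmt.Println(averageValue.String())
}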
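
GetMetricIdentifier(metric, name) is replaced by GetPredictionMetricIdentifier(metric), which resolves the metric name itself via GetMetricName and prefixes it by source type, with the new "container-resource" prefix for ContainerResource metrics; the result is the key used for the expression-query annotation lookup. A tiny stand-in that only mirrors the prefixing (the real helpers take an autoscalingv2.MetricSpec):

package main

import "fmt"

// predictionMetricIdentifier mimics GetPredictionMetricIdentifier: prefix by
// metric source type, then append the name resolved by GetMetricName.
func predictionMetricIdentifier(sourceType, metricName string) string {
	prefixes := map[string]string{
		"Resource":          "resource",
		"ContainerResource": "container-resource",
		"Pods":              "pods",
		"External":          "external",
	}
	return fmt.Sprintf("%s.%s", prefixes[sourceType], metricName)
}

func main() {
	fmt.Println(predictionMetricIdentifier("Resource", "cpu"))          // resource.cpu
	fmt.Println(predictionMetricIdentifier("ContainerResource", "cpu")) // container-resource.cpu
	fmt.Println(predictionMetricIdentifier("Pods", "http_requests"))    // pods.http_requests (hypothetical custom metric)
}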
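
getIdentities now lists target workloads through the new DynamicLister, which lazily creates a dynamic shared informer per GroupVersionResource and serves subsequent lists from its cache instead of hitting the API server on every reconcile. A rough usage sketch outside the controller, assuming the exports stay as in this patch; the config wiring and the deployments GVR are illustrative:

package main

import (
	"context"
	"fmt"

	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/dynamic/dynamicinformer"
	ctrl "sigs.k8s.io/controller-runtime"

	"github.com/gocrane/crane/pkg/controller/recommendation"
)

func main() {
	// Assumes a kubeconfig- or in-cluster-backed rest.Config is available.
	cfg := ctrl.GetConfigOrDie()
	dynamicClient := dynamic.NewForConfigOrDie(cfg)

	// Resync 0, matching the wiring in SetupWithManager.
	factory := dynamicinformer.NewDynamicSharedInformerFactory(dynamicClient, 0)
	lister := recommendation.NewDynamicInformerLister(factory)

	deployments := schema.GroupVersionResource{Group: "apps", Version: "v1", Resource: "deployments"}

	// The first List for a GVR starts its informer and waits for the cache to
	// sync; later calls for the same GVR are served from the cache.
	objs, err := lister.List(context.TODO(), deployments, "default")
	if err != nil {
		panic(err)
	}
	for _, obj := range objs {
		fmt.Println(obj.GetNamespace(), obj.GetName())
	}
}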
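
initControllers also starts the new recommendation Checker, which on every --recommendation-monitor-interval lists all Recommendations and sets the crane_analysis_recommendations_status gauge, marking a recommendation OutDate once its LastUpdateTime is older than --recommendation-outdate-interval and Failed when it carries no recommended value or info. A small sketch of just that classification step, with made-up timestamps:

package main

import (
	"fmt"
	"time"
)

// classify mirrors Checker.runChecker: "OutDate" once LastUpdateTime is older
// than the outdate interval, "Success" only when the recommendation carries a result.
func classify(lastUpdate time.Time, outDateInterval time.Duration, hasResult bool) (updateStatus, resultStatus string) {
	updateStatus = "Updated"
	if time.Since(lastUpdate) > outDateInterval {
		updateStatus = "OutDate"
	}
	resultStatus = "Failed"
	if hasResult {
		resultStatus = "Success"
	}
	return updateStatus, resultStatus
}

func main() {
	// Flag defaults: --recommendation-monitor-interval=1h, --recommendation-outdate-interval=24h.
	outDate := 24 * time.Hour

	u, r := classify(time.Now().Add(-2*time.Hour), outDate, true)
	fmt.Println(u, r) // Updated Success

	u, r = classify(time.Now().Add(-36*time.Hour), outDate, false)
	fmt.Println(u, r) // OutDate Failed
}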