diff --git a/bundle/manifests/windows-machine-config-operator.clusterserviceversion.yaml b/bundle/manifests/windows-machine-config-operator.clusterserviceversion.yaml index 8bcb5e55b5..4b99b58fad 100644 --- a/bundle/manifests/windows-machine-config-operator.clusterserviceversion.yaml +++ b/bundle/manifests/windows-machine-config-operator.clusterserviceversion.yaml @@ -189,16 +189,6 @@ spec: - get - patch - update - - apiGroups: - - "" - resources: - - endpoints - verbs: - - create - - delete - - get - - patch - - update - apiGroups: - "" resources: @@ -211,6 +201,8 @@ spec: - namespaces verbs: - get + - list + - watch - apiGroups: - "" resources: diff --git a/cmd/operator/main.go b/cmd/operator/main.go index 7a50b1dd96..5c6a3c2ba3 100644 --- a/cmd/operator/main.go +++ b/cmd/operator/main.go @@ -27,7 +27,6 @@ import ( "github.com/openshift/windows-machine-config-operator/controllers" "github.com/openshift/windows-machine-config-operator/pkg/cluster" - "github.com/openshift/windows-machine-config-operator/pkg/metrics" "github.com/openshift/windows-machine-config-operator/pkg/nodeconfig/payload" "github.com/openshift/windows-machine-config-operator/pkg/servicescm" "github.com/openshift/windows-machine-config-operator/pkg/windows" @@ -266,22 +265,20 @@ func main() { os.Exit(1) } - //+kubebuilder:scaffold:builder - // The above marker tells kubebuilder that this is where the SetupWithManager function should be inserted when new - // controllers are generated by Operator SDK. - - metricsConfig, err := metrics.NewConfig(mgr, cfg, watchNamespace) + mReconciler, err := controllers.NewMetricReconciler(mgr, clusterConfig, cfg, watchNamespace) if err != nil { - setupLog.Error(err, "failed to create MetricsConfig object") + setupLog.Error(err, "unable to create metrics reconciler") os.Exit(1) } - - // Configure the metric resources - if err := metricsConfig.Configure(ctx); err != nil { - setupLog.Error(err, "error setting up metrics") + if err = mReconciler.SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Metrics") os.Exit(1) } + //+kubebuilder:scaffold:builder + // The above marker tells kubebuilder that this is where the SetupWithManager function should be inserted when new + // controllers are generated by Operator SDK. + // Create the singleton Windows services ConfigMap if err := configMapReconciler.EnsureServicesConfigMapExists(); err != nil { setupLog.Error(err, "error ensuring object exists", "singleton", types.NamespacedName{Namespace: watchNamespace, diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 036b86ef71..dc9896e8ed 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -31,16 +31,6 @@ rules: - get - patch - update -- apiGroups: - - "" - resources: - - endpoints - verbs: - - create - - delete - - get - - patch - - update - apiGroups: - "" resources: @@ -53,6 +43,8 @@ rules: - namespaces verbs: - get + - list + - watch - apiGroups: - "" resources: diff --git a/controllers/configmap_controller.go b/controllers/configmap_controller.go index 1e5ab5ae43..33e325c19f 100644 --- a/controllers/configmap_controller.go +++ b/controllers/configmap_controller.go @@ -46,7 +46,6 @@ import ( "github.com/openshift/windows-machine-config-operator/pkg/crypto" "github.com/openshift/windows-machine-config-operator/pkg/ignition" "github.com/openshift/windows-machine-config-operator/pkg/instance" - "github.com/openshift/windows-machine-config-operator/pkg/metrics" "github.com/openshift/windows-machine-config-operator/pkg/nodeconfig" "github.com/openshift/windows-machine-config-operator/pkg/patch" "github.com/openshift/windows-machine-config-operator/pkg/secrets" @@ -93,12 +92,6 @@ func NewConfigMapReconciler(mgr manager.Manager, clusterConfig cluster.Config, w return nil, fmt.Errorf("error creating kubernetes clientset: %w", err) } - // Initialize prometheus configuration - pc, err := metrics.NewPrometheusNodeConfig(clientset, watchNamespace) - if err != nil { - return nil, fmt.Errorf("unable to initialize Prometheus configuration: %w", err) - } - directClient, err := client.New(mgr.GetConfig(), client.Options{Scheme: mgr.GetScheme()}) if err != nil { return nil, err @@ -119,14 +112,13 @@ func NewConfigMapReconciler(mgr manager.Manager, clusterConfig cluster.Config, w return &ConfigMapReconciler{ instanceReconciler: instanceReconciler{ - client: mgr.GetClient(), - k8sclientset: clientset, - clusterServiceCIDR: clusterConfig.Network().GetServiceCIDR(), - log: ctrl.Log.WithName("controllers").WithName(ConfigMapController), - watchNamespace: watchNamespace, - recorder: mgr.GetEventRecorderFor(ConfigMapController), - prometheusNodeConfig: pc, - platform: clusterConfig.Platform(), + client: mgr.GetClient(), + k8sclientset: clientset, + clusterServiceCIDR: clusterConfig.Network().GetServiceCIDR(), + log: ctrl.Log.WithName("controllers").WithName(ConfigMapController), + watchNamespace: watchNamespace, + recorder: mgr.GetEventRecorderFor(ConfigMapController), + platform: clusterConfig.Platform(), }, servicesManifest: svcData, proxyEnabled: proxyEnabled, @@ -288,10 +280,6 @@ func (r *ConfigMapReconciler) reconcileNodes(ctx context.Context, windowsInstanc return fmt.Errorf("error removing undesired nodes from cluster: %w", err) } - // Once all the proper Nodes are in the cluster, configure the prometheus endpoints. - if err := r.prometheusNodeConfig.Configure(); err != nil { - return fmt.Errorf("unable to configure Prometheus: %w", err) - } return nil } diff --git a/controllers/controllers.go b/controllers/controllers.go index c9386bc91e..15e78bb911 100644 --- a/controllers/controllers.go +++ b/controllers/controllers.go @@ -21,7 +21,6 @@ import ( "github.com/openshift/windows-machine-config-operator/pkg/crypto" "github.com/openshift/windows-machine-config-operator/pkg/instance" "github.com/openshift/windows-machine-config-operator/pkg/metadata" - "github.com/openshift/windows-machine-config-operator/pkg/metrics" "github.com/openshift/windows-machine-config-operator/pkg/nodeconfig" "github.com/openshift/windows-machine-config-operator/pkg/secrets" "github.com/openshift/windows-machine-config-operator/version" @@ -51,8 +50,6 @@ type instanceReconciler struct { watchNamespace string // signer is a signer created from the user's private key signer ssh.Signer - // prometheusNodeConfig stores information required to configure Prometheus - prometheusNodeConfig *metrics.PrometheusNodeConfig // recorder to generate events recorder record.EventRecorder // platform indicates the cloud on which the cluster is running diff --git a/controllers/metric_controller.go b/controllers/metric_controller.go new file mode 100644 index 0000000000..f6750c414b --- /dev/null +++ b/controllers/metric_controller.go @@ -0,0 +1,262 @@ +package controllers + +import ( + "context" + "fmt" + "reflect" + "strconv" + + monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + monclient "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned/typed/monitoring/v1" + "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" + + "github.com/openshift/windows-machine-config-operator/pkg/cluster" + "github.com/openshift/windows-machine-config-operator/pkg/condition" + "github.com/openshift/windows-machine-config-operator/pkg/metrics" +) + +//+kubebuilder:rbac:groups="",resources=namespaces,verbs=get;list;watch +//+kubebuilder:rbac:groups="",resources=services;services/finalizers,verbs=create;get;delete +//+kubebuilder:rbac:groups="monitoring.coreos.com",resources=servicemonitors,verbs=create;get;delete +//+kubebuilder:rbac:groups="",resources=events,verbs=* + +const ( + // MetricController is the name of this controller in logs and other outputs. + MetricController = "metrics" +) + +type metricReconciler struct { + *monclient.MonitoringV1Client + instanceReconciler +} + +func NewMetricReconciler(mgr manager.Manager, clusterConfig cluster.Config, cfg *rest.Config, watchNamespace string) (*metricReconciler, error) { + clientset, err := kubernetes.NewForConfig(mgr.GetConfig()) + if err != nil { + return nil, fmt.Errorf("error creating kubernetes clientset: %w", err) + } + mclient, err := monclient.NewForConfig(cfg) + if err != nil { + return nil, fmt.Errorf("error creating monitoring client: %w", err) + } + return &metricReconciler{ + MonitoringV1Client: mclient, + instanceReconciler: instanceReconciler{ + client: mgr.GetClient(), + log: ctrl.Log.WithName("controllers").WithName(MetricController), + k8sclientset: clientset, + clusterServiceCIDR: clusterConfig.Network().GetServiceCIDR(), + watchNamespace: watchNamespace, + recorder: mgr.GetEventRecorderFor(MetricController), + }, + }, nil +} + +// Reconcile is part of the main kubernetes reconciliation loop which reads that state of the cluster for a +// Node object and aims to move the current state of the cluster closer to the desired state. +func (r *metricReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, err error) { + r.log = r.log.WithValues(MetricController, req.NamespacedName) + // Prevent WMCO upgrades while Node objects are being processed + if err := condition.MarkAsBusy(r.client, r.watchNamespace, r.recorder, MetricController); err != nil { + return ctrl.Result{}, err + } + defer func() { + err = markAsFreeOnSuccess(r.client, r.watchNamespace, r.recorder, MetricController, result.Requeue, err) + }() + // validate if cluster monitoring is enabled in the operator namespace + enabled, err := r.validate(ctx) + if err != nil { + return ctrl.Result{}, fmt.Errorf("error validating cluster monitoring label: %s", err) + } + // Proceed only if monitoring is enabled + if !enabled { + return ctrl.Result{}, nil + } + if err := r.ensureServiceMonitor(); err != nil { + return ctrl.Result{}, fmt.Errorf("error ensuring serviceMonitor exists: %w", err) + } + return ctrl.Result{}, nil +} + +// validate will verify if cluster monitoring is enabled in the operator namespace. If the label is set to false or not +// present, it will log and send warning events to the user. If the label holds a non-boolean value, returns an error. +func (r *metricReconciler) validate(ctx context.Context) (bool, error) { + // validate if metrics label is added to namespace + labelValue := false + var err error + wmcoNamespace, err := r.k8sclientset.CoreV1().Namespaces().Get(ctx, r.watchNamespace, metav1.GetOptions{}) + if err != nil { + return false, fmt.Errorf("error getting operator namespace: %w", err) + } + // if the label exists, update value from default of false + if value, ok := wmcoNamespace.Labels["openshift.io/cluster-monitoring"]; ok { + labelValue, err = strconv.ParseBool(value) + if err != nil { + return false, fmt.Errorf("monitoring label must have a boolean value: %w", err) + } + } + if !labelValue { + r.recorder.Eventf(wmcoNamespace, v1.EventTypeWarning, "labelValidationFailed", + "Cluster monitoring openshift.io/cluster-monitoring=true label is not enabled in %s namespace", + r.watchNamespace) + } + return labelValue, nil +} + +// ensureServiceMonitor creates a serviceMonitor object in the operator namespace if it does not exist. +func (r *metricReconciler) ensureServiceMonitor() error { + // get existing serviceMonitor object if it exists + existingSM, err := r.ServiceMonitors(r.watchNamespace).Get(context.TODO(), metrics.WindowsMetricsResource, metav1.GetOptions{}) + if err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf( + "error retrieving %s serviceMonitor: %w", metrics.WindowsMetricsResource, err) + } + + serverName := fmt.Sprintf("%s.%s.svc", metrics.WindowsMetricsResource, r.watchNamespace) + replacement0 := "$1" + replacement1 := "$1:" + string(metrics.Port) + replacement2 := metrics.WindowsMetricsResource + attachMetadataBool := true + expectedSM := &monv1.ServiceMonitor{ + ObjectMeta: metav1.ObjectMeta{ + Name: metrics.WindowsMetricsResource, + Namespace: r.watchNamespace, + Labels: map[string]string{ + "name": metrics.WindowsMetricsResource, + }, + }, + Spec: monv1.ServiceMonitorSpec{ + AttachMetadata: &monv1.AttachMetadata{ + Node: &attachMetadataBool, + }, + Endpoints: []monv1.Endpoint{ + { + HonorLabels: true, + Interval: "30s", + Path: "/metrics", + Port: "metrics", + Scheme: "https", + BearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token", + TLSConfig: &monv1.TLSConfig{ + CAFile: "/etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt", + SafeTLSConfig: monv1.SafeTLSConfig{ + ServerName: &serverName, + }, + }, + RelabelConfigs: []monv1.RelabelConfig{ + { + Action: "replace", + Regex: "(.*)", + Replacement: &replacement0, + TargetLabel: "instance", + SourceLabels: []monv1.LabelName{ + "__meta_kubernetes_endpoint_address_target_name", + }, + }, + { // Include only Windows nodes for this serviceMonitor + Action: "keep", + Regex: "windows", + SourceLabels: []monv1.LabelName{ + "__meta_kubernetes_node_label_kubernetes_io_os", + }, + }, + { // Change the port from the kubelet port 10250 to 9182 + Action: "replace", + Regex: "(.+)(?::\\d+)", + Replacement: &replacement1, + TargetLabel: "__address__", + SourceLabels: []monv1.LabelName{ + "__address__", + }, + }, + { // Update the job label from kubelet to windows-exporter + Action: "replace", + Replacement: &replacement2, + TargetLabel: "job", + }, + }, + }, + }, + NamespaceSelector: monv1.NamespaceSelector{ + MatchNames: []string{"kube-system"}, + }, + Selector: metav1.LabelSelector{ + MatchLabels: map[string]string{ + "k8s-app": "kubelet", + }, + }, + }, + } + + if err == nil { + // check if existing serviceMonitor's contents are as expected, delete it if not + if existingSM.Name == expectedSM.Name && existingSM.Namespace == expectedSM.Namespace && + reflect.DeepEqual(existingSM.Spec, expectedSM.Spec) { + return nil + } + err = r.ServiceMonitors(r.watchNamespace).Delete(context.TODO(), metrics.WindowsMetricsResource, + metav1.DeleteOptions{}) + if err != nil { + return fmt.Errorf("unable to delete service monitor %s/%s: %w", r.watchNamespace, metrics.WindowsMetricsResource, + err) + } + r.log.Info("Deleted malformed resource", "serviceMonitor", metrics.WindowsMetricsResource, + "namespace", r.watchNamespace) + } + + _, err = r.ServiceMonitors(r.watchNamespace).Create(context.TODO(), expectedSM, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("error creating service monitor: %w", err) + } + return nil +} + +// SetupWithManager sets up the controller with the Manager. +func (r *metricReconciler) SetupWithManager(mgr ctrl.Manager) error { + metricsPredicate := predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return isMonitoringEnabled(e.Object, r.watchNamespace) + }, + UpdateFunc: func(e event.UpdateEvent) bool { + return isMonitoringEnabled(e.ObjectNew, r.watchNamespace) + }, + GenericFunc: func(e event.GenericEvent) bool { + return isMonitoringEnabled(e.Object, r.watchNamespace) + }, + DeleteFunc: func(e event.DeleteEvent) bool { + return false + }, + } + return ctrl.NewControllerManagedBy(mgr). + For(&v1.Namespace{}, builder.WithPredicates(metricsPredicate)). + Complete(r) +} + +// isMonitoringEnabled returns true if the given object namespace has monitoring label set to true +func isMonitoringEnabled(obj runtime.Object, watchNamespace string) bool { + namespace, ok := obj.(*v1.Namespace) + if !ok { + return false + } + if namespace.GetName() != watchNamespace { + return false + } + if value, ok := namespace.Labels["openshift.io/cluster-monitoring"]; ok { + labelValue, err := strconv.ParseBool(value) + if err != nil { + return false + } + return labelValue + } + return false +} diff --git a/controllers/windowsmachine_controller.go b/controllers/windowsmachine_controller.go index 98e28b07e4..ed5cab2832 100644 --- a/controllers/windowsmachine_controller.go +++ b/controllers/windowsmachine_controller.go @@ -30,7 +30,6 @@ import ( "github.com/openshift/windows-machine-config-operator/pkg/crypto" "github.com/openshift/windows-machine-config-operator/pkg/instance" "github.com/openshift/windows-machine-config-operator/pkg/metadata" - "github.com/openshift/windows-machine-config-operator/pkg/metrics" "github.com/openshift/windows-machine-config-operator/pkg/nodeconfig" "github.com/openshift/windows-machine-config-operator/pkg/secrets" "github.com/openshift/windows-machine-config-operator/pkg/signer" @@ -82,22 +81,15 @@ func NewWindowsMachineReconciler(mgr manager.Manager, clusterConfig cluster.Conf return nil, fmt.Errorf("error creating machine client: %w", err) } - // Initialize prometheus configuration - pc, err := metrics.NewPrometheusNodeConfig(clientset, watchNamespace) - if err != nil { - return nil, fmt.Errorf("unable to initialize Prometheus configuration: %w", err) - } - return &WindowsMachineReconciler{ instanceReconciler: instanceReconciler{ - client: mgr.GetClient(), - log: ctrl.Log.WithName("controller").WithName(WindowsMachineController), - k8sclientset: clientset, - clusterServiceCIDR: clusterConfig.Network().GetServiceCIDR(), - recorder: mgr.GetEventRecorderFor(WindowsMachineController), - watchNamespace: watchNamespace, - prometheusNodeConfig: pc, - platform: clusterConfig.Platform(), + client: mgr.GetClient(), + log: ctrl.Log.WithName("controller").WithName(WindowsMachineController), + k8sclientset: clientset, + clusterServiceCIDR: clusterConfig.Network().GetServiceCIDR(), + recorder: mgr.GetEventRecorderFor(WindowsMachineController), + watchNamespace: watchNamespace, + platform: clusterConfig.Platform(), }, machineClient: machineClient, }, nil @@ -243,7 +235,7 @@ func (r *WindowsMachineReconciler) Reconcile(ctx context.Context, // In the case the machine was deleted, ensure that the metrics subsets are configured properly, so that // the current Windows nodes are properly reflected there. log.V(1).Info("not found") - return ctrl.Result{}, r.prometheusNodeConfig.Configure() + return ctrl.Result{}, nil } // Error reading the object - requeue the request. return ctrl.Result{}, err @@ -299,21 +291,11 @@ func (r *WindowsMachineReconciler) Reconcile(ctx context.Context, } if node.Annotations[metadata.VersionAnnotation] == version.Get() { // version annotation exists with a valid value, node is fully configured. - // configure Prometheus when we have already configured Windows Nodes. This is required to update - // Endpoints object if it gets reverted when the operator pod restarts. - if err := r.prometheusNodeConfig.Configure(); err != nil { - return ctrl.Result{}, fmt.Errorf("unable to configure Prometheus: %w", err) - } return ctrl.Result{}, nil } } } else if *machine.Status.Phase != provisionedPhase { log.V(1).Info("machine not provisioned", "phase", *machine.Status.Phase) - // configure Prometheus when a machine is not in `Running` or `Provisioned` phase. This configuration is - // required to update Endpoints object when Windows machines are being deleted. - if err := r.prometheusNodeConfig.Configure(); err != nil { - return ctrl.Result{}, fmt.Errorf("unable to configure Prometheus: %w", err) - } // Machine is not in provisioned or running state, nothing we should do as of now return ctrl.Result{}, nil } @@ -360,10 +342,6 @@ func (r *WindowsMachineReconciler) Reconcile(ctx context.Context, } r.recorder.Eventf(machine, core.EventTypeNormal, "MachineSetup", "Machine %s configured successfully", machine.Name) - // configure Prometheus after a Windows machine is configured as a Node. - if err := r.prometheusNodeConfig.Configure(); err != nil { - return ctrl.Result{}, fmt.Errorf("unable to configure Prometheus: %w", err) - } return ctrl.Result{}, nil } diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index dc5410dad5..09b9c976eb 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -1,384 +1,11 @@ package metrics -import ( - "context" - "encoding/json" - "fmt" - "reflect" - "strconv" - - monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - monclient "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned/typed/monitoring/v1" - "k8s.io/api/core/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/kubernetes" - k8sclient "k8s.io/client-go/kubernetes" - "k8s.io/client-go/rest" - "k8s.io/client-go/tools/record" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/manager" - - "github.com/openshift/windows-machine-config-operator/pkg/nodeconfig" - "github.com/openshift/windows-machine-config-operator/pkg/patch" -) - -//+kubebuilder:rbac:groups="",resources=services;services/finalizers,verbs=create;get;delete -//+kubebuilder:rbac:groups="",resources=endpoints,verbs=create;get;delete;update;patch -//+kubebuilder:rbac:groups="",resources=namespaces,verbs=get -//+kubebuilder:rbac:groups="",resources=nodes,verbs=list -//+kubebuilder:rbac:groups="monitoring.coreos.com",resources=servicemonitors,verbs=create;get;delete -//+kubebuilder:rbac:groups="",resources=events,verbs=* - -var ( - log = ctrl.Log.WithName("metrics") - // metricsEnabled specifies if metrics are enabled in the current cluster - metricsEnabled = true -) - const ( - // metricsPortName specifies the portname used for Prometheus monitoring + // PortName specifies the portname used for Prometheus monitoring PortName = "metrics" - // Host is the host address used by Windows metrics - Host = "0.0.0.0" // Port is the port number on which windows-exporter is exposed. Port int32 = 9182 // WindowsMetricsResource is the name for objects created for Prometheus monitoring // by current operator version. Its name is defined through the bundle manifests WindowsMetricsResource = "windows-exporter" ) - -// PrometheusNodeConfig holds the information required to configure Prometheus, so that it can scrape metrics from the -// given endpoint address -type PrometheusNodeConfig struct { - // k8sclientset is a handle that allows us to interact with the Kubernetes API. - k8sclientset *kubernetes.Clientset - // namespace is the namespace in which metrics endpoints object is created - namespace string -} - -// Config holds the information required to interact with metrics objects -type Config struct { - // a handle that allows us to interact with the Kubernetes API. - *kubernetes.Clientset - // a handle that allows us to interact with the Monitoring API. - *monclient.MonitoringV1Client - // namespace is the namespace in which metrics objects are created - namespace string - // recorder to generate events - recorder record.EventRecorder -} - -// NewPrometheuopsNodeConfig creates a new instance for prometheusNodeConfig to be used by the caller. -func NewPrometheusNodeConfig(clientset *kubernetes.Clientset, watchNamespace string) (*PrometheusNodeConfig, error) { - - return &PrometheusNodeConfig{ - k8sclientset: clientset, - namespace: watchNamespace, - }, nil -} - -// NewConfig creates a new instance for Config to be used by the caller. -func NewConfig(mgr manager.Manager, cfg *rest.Config, namespace string) (*Config, error) { - if cfg == nil { - return nil, fmt.Errorf("config should not be nil") - } - oclient, err := k8sclient.NewForConfig(cfg) - if err != nil { - return nil, fmt.Errorf("error creating config client: %w", err) - } - mclient, err := monclient.NewForConfig(cfg) - if err != nil { - return nil, fmt.Errorf("error creating monitoring client: %w", err) - } - return &Config{Clientset: oclient, - MonitoringV1Client: mclient, - namespace: namespace, - recorder: mgr.GetEventRecorderFor("metrics"), - }, nil -} - -// syncMetricsEndpoint updates the endpoint object with the new list of IP addresses from the Windows nodes and the -// metrics port. -func (pc *PrometheusNodeConfig) syncMetricsEndpoint(nodeEndpointAdressess []v1.EndpointAddress) error { - // Update EndpointSubset field with list of Windows Nodes endpoint addresses and required metrics port information - // We need to patch the entire endpoint subset field, since addresses and ports both fields are deleted when there - // are no Windows nodes. - var subsets []v1.EndpointSubset - if nodeEndpointAdressess != nil { - subsets = []v1.EndpointSubset{{ - Addresses: nodeEndpointAdressess, - Ports: []v1.EndpointPort{{ - Name: PortName, - Port: Port, - Protocol: v1.ProtocolTCP, - }}, - }} - } - - patchData := []*patch.JSONPatch{patch.NewJSONPatch("replace", "/subsets", subsets)} - // convert patch data to bytes - patchDataBytes, err := json.Marshal(patchData) - if err != nil { - return fmt.Errorf("unable to get patch data in bytes: %w", err) - } - - _, err = pc.k8sclientset.CoreV1().Endpoints(pc.namespace). - Patch(context.TODO(), WindowsMetricsResource, types.JSONPatchType, patchDataBytes, metav1.PatchOptions{}) - if err != nil { - return fmt.Errorf("unable to sync metrics endpoints: %w", err) - } - return nil -} - -// Configure patches the endpoint object to reflect the current list Windows nodes. -func (pc *PrometheusNodeConfig) Configure() error { - // Check if metrics are enabled in current cluster - if !metricsEnabled { - log.Info("install the prometheus-operator to enable Prometheus configuration") - return nil - } - // get list of Windows nodes that are in Ready phase - nodes, err := pc.k8sclientset.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{LabelSelector: nodeconfig.WindowsOSLabel, - FieldSelector: "spec.unschedulable=false"}) - if err != nil { - return fmt.Errorf("could not get Windows nodes: %w", err) - } - - // get Metrics Endpoints object - endpoints, err := pc.k8sclientset.CoreV1().Endpoints(pc.namespace).Get(context.TODO(), - WindowsMetricsResource, metav1.GetOptions{}) - if err != nil { - return fmt.Errorf("could not get metrics endpoints %v: %w", WindowsMetricsResource, err) - } - - if !isEndpointsValid(nodes, endpoints) { - // check if we can get list of endpoint addresses - windowsIPList := getNodeEndpointAddresses(nodes) - // sync metrics endpoints object with the current list of addresses - if err := pc.syncMetricsEndpoint(windowsIPList); err != nil { - return fmt.Errorf("error updating endpoints object with list of endpoint addresses: %w", err) - } - log.Info("Prometheus configured", "endpoints", WindowsMetricsResource, "port", Port, "name", PortName) - } - return nil -} - -// getNodeEndpointAddresses returns a list of endpoint addresses according to the given list of Windows nodes -func getNodeEndpointAddresses(nodes *v1.NodeList) []v1.EndpointAddress { - // an empty list to store node IP addresses - var nodeIPAddress []v1.EndpointAddress - // loops through nodes - for _, node := range nodes.Items { - for _, address := range node.Status.Addresses { - if address.Type == "InternalIP" && address.Address != "" { - // add IP address address to the endpoint address list - nodeIPAddress = append(nodeIPAddress, v1.EndpointAddress{ - IP: address.Address, - Hostname: "", - NodeName: nil, - TargetRef: &v1.ObjectReference{ - Kind: "Node", - Name: node.Name, - }, - }) - break - } - } - } - return nodeIPAddress -} - -// isEndpointsValid returns true if Endpoints object has entries for all the Windows nodes in the cluster. -// It returns false when any one of the Windows nodes is not present in the subset. -func isEndpointsValid(nodes *v1.NodeList, endpoints *v1.Endpoints) bool { - // check if number of entries in endpoints object match number of Ready Windows nodes - if len(endpoints.Subsets) == 0 || len(nodes.Items) != len(endpoints.Subsets[0].Addresses) { - return false - } - - for _, node := range nodes.Items { - nodeFound := false - for _, address := range endpoints.Subsets[0].Addresses { - // check TargetRef is present and has the expected kind - if address.TargetRef == nil || address.TargetRef.Kind != "Node" { - // otherwise, skip the invalid address - continue - } - if address.TargetRef.Name == node.Name { - nodeFound = true - break - } - } - if !nodeFound { - return false - } - } - return true -} - -// Configure takes care of all the required configuration steps -// for Prometheus monitoring like validating monitoring label -// and creating metrics Endpoints object. -func (c *Config) Configure(ctx context.Context) error { - // validate if cluster monitoring is enabled in the operator namespace - enabled, err := c.validate(ctx) - if err != nil { - return fmt.Errorf("error validating cluster monitoring label: %s", err) - } - // Create Metrics Endpoint object only if monitoring is enabled - if !enabled { - return nil - } - if err := c.ensureServiceMonitor(); err != nil { - return fmt.Errorf("error ensuring serviceMonitor exists: %w", err) - } - // In the case of an operator restart, a previous Endpoint object will be deleted and a new one will - // be created to ensure we have a correct spec. - var subsets []v1.EndpointSubset - existingEndpoint, err := c.CoreV1().Endpoints(c.namespace).Get(ctx, WindowsMetricsResource, metav1.GetOptions{}) - if err != nil { - if !apierrors.IsNotFound(err) { - return fmt.Errorf("error retrieving %s endpoint: %w", WindowsMetricsResource, err) - } - } else { - subsets = existingEndpoint.Subsets - err = c.CoreV1().Endpoints(c.namespace).Delete(ctx, WindowsMetricsResource, metav1.DeleteOptions{}) - if err != nil { - return fmt.Errorf("error deleting %s endpoint: %w", WindowsMetricsResource, err) - } - } - if err := c.createEndpoint(subsets); err != nil { - return fmt.Errorf("error creating metrics Endpoint: %w", err) - } - return nil -} - -// validate will verify if cluster monitoring is enabled in the operator namespace. If the label is set to false or not -// present, it will log and send warning events to the user. If the label holds a non-boolean value, returns an error. -func (c *Config) validate(ctx context.Context) (bool, error) { - // validate if metrics label is added to namespace - wmcoNamespace, err := c.CoreV1().Namespaces().Get(ctx, c.namespace, metav1.GetOptions{}) - if err != nil { - return false, fmt.Errorf("error getting operator namespace: %w", err) - } - - labelValue := false - // if the label exists, update value from default of false - if value, ok := wmcoNamespace.Labels["openshift.io/cluster-monitoring"]; ok { - labelValue, err = strconv.ParseBool(value) - if err != nil { - return false, fmt.Errorf("monitoring label must have a boolean value: %w", err) - } - } - if !labelValue { - c.recorder.Eventf(wmcoNamespace, v1.EventTypeWarning, "labelValidationFailed", - "Cluster monitoring openshift.io/cluster-monitoring=true label is not enabled in %s namespace", c.namespace) - } - metricsEnabled = labelValue - return metricsEnabled, nil -} - -// createEndpoint creates an endpoint object in the operator namespace. -// WMCO is no longer creating a service with a selector therefore no Endpoint -// object is created and WMCO needs to create the Endpoint object. -// We cannot create endpoints as a part of manifests deployment as -// Endpoints resources are not currently OLM-supported for bundle creation. -func (c *Config) createEndpoint(subsets []v1.EndpointSubset) error { - // create new Endpoint - newEndpoint := &v1.Endpoints{ - TypeMeta: metav1.TypeMeta{ - Kind: "Endpoints", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: WindowsMetricsResource, - Namespace: c.namespace, - Labels: map[string]string{"name": WindowsMetricsResource}, - }, - Subsets: subsets, - } - _, err := c.CoreV1().Endpoints(c.namespace).Create(context.TODO(), - newEndpoint, metav1.CreateOptions{}) - if err != nil { - return fmt.Errorf("error creating metrics Endpoint: %w", err) - } - return nil -} - -// ensureServiceMonitor creates a serviceMonitor object in the operator namespace if it does not exist. -func (c *Config) ensureServiceMonitor() error { - // get existing serviceMonitor object if it exists - existingSM, err := c.ServiceMonitors(c.namespace).Get(context.TODO(), WindowsMetricsResource, metav1.GetOptions{}) - if err != nil && !apierrors.IsNotFound(err) { - return fmt.Errorf("error retrieving %s serviceMonitor: %w", WindowsMetricsResource, err) - } - - serverName := fmt.Sprintf("%s.%s.svc", WindowsMetricsResource, c.namespace) - replacement := "$1" - expectedSM := &monv1.ServiceMonitor{ - ObjectMeta: metav1.ObjectMeta{ - Name: WindowsMetricsResource, - Namespace: c.namespace, - Labels: map[string]string{ - "name": WindowsMetricsResource, - }, - }, - Spec: monv1.ServiceMonitorSpec{ - Endpoints: []monv1.Endpoint{ - { - HonorLabels: true, - Interval: "30s", - Path: "/metrics", - Port: "metrics", - Scheme: "https", - BearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token", - TLSConfig: &monv1.TLSConfig{ - CAFile: "/etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt", - SafeTLSConfig: monv1.SafeTLSConfig{ - ServerName: &serverName, - }, - }, - RelabelConfigs: []monv1.RelabelConfig{ - { - Action: "replace", - Regex: "(.*)", - Replacement: &replacement, - TargetLabel: "instance", - SourceLabels: []monv1.LabelName{ - "__meta_kubernetes_endpoint_address_target_name", - }, - }, - }, - }, - }, - Selector: metav1.LabelSelector{ - MatchLabels: map[string]string{ - "name": WindowsMetricsResource, - }, - }, - }, - } - - if err == nil { - // check if existing serviceMonitor's contents are as expected, delete it if not - if existingSM.Name == expectedSM.Name && existingSM.Namespace == expectedSM.Namespace && - reflect.DeepEqual(existingSM.Spec, expectedSM.Spec) { - return nil - } - err = c.ServiceMonitors(c.namespace).Delete(context.TODO(), WindowsMetricsResource, - metav1.DeleteOptions{}) - if err != nil { - return fmt.Errorf("unable to delete service monitor %s/%s: %w", c.namespace, WindowsMetricsResource, - err) - } - log.Info("Deleted malformed resource", "serviceMonitor", WindowsMetricsResource, - "namespace", c.namespace) - } - - _, err = c.ServiceMonitors(c.namespace).Create(context.TODO(), expectedSM, metav1.CreateOptions{}) - if err != nil { - return fmt.Errorf("error creating service monitor: %w", err) - } - return nil -} diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go deleted file mode 100644 index 1a4f240b52..0000000000 --- a/pkg/metrics/metrics_test.go +++ /dev/null @@ -1,201 +0,0 @@ -package metrics - -import ( - "testing" - - "github.com/stretchr/testify/assert" - "k8s.io/api/core/v1" - meta "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -func TestIsEndpointsValid(t *testing.T) { - tests := []struct { - name string - nodes *v1.NodeList - endpoints *v1.Endpoints - want bool - }{ - { - name: "EndpointAddresses match nodes count and name", - nodes: &v1.NodeList{ - Items: []v1.Node{ - { - ObjectMeta: meta.ObjectMeta{Name: "the-node-name"}, - }, - }, - }, - endpoints: &v1.Endpoints{ - Subsets: []v1.EndpointSubset{ - {Addresses: []v1.EndpointAddress{ - { - TargetRef: &v1.ObjectReference{ - Kind: "Node", - Name: "the-node-name", - }, - }, - }}, - }, - }, - want: true, - }, - { - name: "EndpointAddresses match two nodes", - nodes: &v1.NodeList{ - Items: []v1.Node{ - { - ObjectMeta: meta.ObjectMeta{Name: "the-node-name-one"}, - }, - { - ObjectMeta: meta.ObjectMeta{Name: "the-node-name-two"}, - }, - }, - }, - endpoints: &v1.Endpoints{ - Subsets: []v1.EndpointSubset{ - { - Addresses: []v1.EndpointAddress{ - { - TargetRef: &v1.ObjectReference{ - Kind: "Node", - Name: "the-node-name-two", - }, - }, - { - TargetRef: &v1.ObjectReference{ - Kind: "Node", - Name: "the-node-name-one", - }, - }, - }, - }, - }, - }, - want: true, - }, - { - name: "No Endpoint Subsets", - nodes: &v1.NodeList{ - Items: []v1.Node{ - { - ObjectMeta: meta.ObjectMeta{Name: "the-node-name"}, - }, - }, - }, - endpoints: &v1.Endpoints{ - Subsets: []v1.EndpointSubset{}, - }, - want: false, - }, - { - name: "No nodes", - nodes: &v1.NodeList{ - Items: []v1.Node{}, - }, - endpoints: &v1.Endpoints{ - Subsets: []v1.EndpointSubset{ - {Addresses: []v1.EndpointAddress{ - { - TargetRef: &v1.ObjectReference{ - Kind: "Node", - Name: "the-node-name", - }, - }, - }}, - }, - }, - want: false, - }, - { - name: "EndpointAddress does not match node name", - nodes: &v1.NodeList{ - Items: []v1.Node{ - { - ObjectMeta: meta.ObjectMeta{Name: "the-node-name"}, - }, - }, - }, - endpoints: &v1.Endpoints{ - Subsets: []v1.EndpointSubset{ - {Addresses: []v1.EndpointAddress{ - { - TargetRef: &v1.ObjectReference{ - Kind: "Node", - Name: "wrong-node-name", - }, - }, - }}, - }, - }, - want: false, - }, - { - name: "EndpointAddress without targetRef", - nodes: &v1.NodeList{ - Items: []v1.Node{ - { - ObjectMeta: meta.ObjectMeta{Name: "the-node-name"}, - }, - }, - }, - endpoints: &v1.Endpoints{ - Subsets: []v1.EndpointSubset{ - {Addresses: []v1.EndpointAddress{ - { - IP: "1.2.3.4", - }, - }}, - }, - }, - want: false, - }, - { - name: "EndpointAddress with targetRef without name", - nodes: &v1.NodeList{ - Items: []v1.Node{ - { - ObjectMeta: meta.ObjectMeta{Name: "the-node-name"}, - }, - }, - }, - endpoints: &v1.Endpoints{ - Subsets: []v1.EndpointSubset{ - {Addresses: []v1.EndpointAddress{ - { - TargetRef: &v1.ObjectReference{ - Kind: "Node", - }, - }, - }}, - }, - }, - want: false, - }, - { - name: "EndpointAddress with targetRef invalid kind", - nodes: &v1.NodeList{ - Items: []v1.Node{ - { - ObjectMeta: meta.ObjectMeta{Name: "the-node-name"}, - }, - }, - }, - endpoints: &v1.Endpoints{ - Subsets: []v1.EndpointSubset{ - {Addresses: []v1.EndpointAddress{ - { - TargetRef: &v1.ObjectReference{ - Kind: "AnotherKind", - }, - }, - }}, - }, - }, - want: false, - }, - } - for _, test := range tests { - t.Run(test.name, func(t *testing.T) { - assert.Equal(t, test.want, isEndpointsValid(test.nodes, test.endpoints)) - }) - } -} diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go index 8e56ed0962..2c7fc6a35a 100644 --- a/test/e2e/metrics_test.go +++ b/test/e2e/metrics_test.go @@ -79,46 +79,6 @@ func (tc *testContext) testPrometheus(t *testing.T) { metrics.WindowsMetricsResource, metav1.GetOptions{}) require.NoError(t, err, "error getting service monitor") - // check that endpoints exists - windowsEndpoints, err := tc.client.K8s.CoreV1().Endpoints(wmcoNamespace).Get(context.TODO(), - metrics.WindowsMetricsResource, metav1.GetOptions{}) - require.NoError(t, err) - - if len(gc.allNodes()) == 0 { - // check if all entries in subset are deleted when there are no Windows Nodes - require.Equal(t, 0, len(windowsEndpoints.Subsets)) - } else { - // Total length of list for subsets is always equal to the list of Windows Nodes. - require.Equal(t, len(gc.allNodes()), len(windowsEndpoints.Subsets[0].Addresses)) - - // check Nodes in the targetRef of Endpoints are same as the Windows Nodes bootstrapped using WMCO - err = checkTargetNodes(windowsEndpoints) - require.NoError(t, err) - - // check Port name matches - require.Equal(t, windowsEndpoints.Subsets[0].Ports[0].Name, metrics.PortName) - // check Port matches the defined port - require.Equal(t, windowsEndpoints.Subsets[0].Ports[0].Port, metrics.Port) - } - -} - -// checkTargetNodes checks if nodes in the targetRef of Endpoints are same as the Windows Nodes bootstrapped using WMCO -func checkTargetNodes(windowsEndpoints *core.Endpoints) error { - for _, node := range gc.allNodes() { - foundNode := false - for _, endpointAddress := range windowsEndpoints.Subsets[0].Addresses { - if node.Name == endpointAddress.TargetRef.Name { - foundNode = true - break - } - } - if !foundNode { - return fmt.Errorf("target node not found in Endpoints object") - } - } - - return nil } // PrometheusQuery defines the result of the /query request