diff --git a/controllers/nicclusterpolicy_controller.go b/controllers/nicclusterpolicy_controller.go index dbad39fee..1674a885f 100644 --- a/controllers/nicclusterpolicy_controller.go +++ b/controllers/nicclusterpolicy_controller.go @@ -52,6 +52,7 @@ type NicClusterPolicyReconciler struct { Scheme *runtime.Scheme ClusterTypeProvider clustertype.Provider StaticConfigProvider staticconfig.Provider + MigrationCh chan struct{} stateManager state.Manager } @@ -86,6 +87,11 @@ type NicClusterPolicyReconciler struct { // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. func (r *NicClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + select { + case <-r.MigrationCh: + case <-ctx.Done(): + return ctrl.Result{}, fmt.Errorf("canceled") + } reqLogger := log.FromContext(ctx) reqLogger.V(consts.LogLevelInfo).Info("Reconciling NicClusterPolicy") @@ -179,6 +185,10 @@ func (r *NicClusterPolicyReconciler) handleMOFEDWaitLabels( _ = r.Client.List(ctx, pods, client.MatchingLabels{"driver-pod": podLabel}) for i := range pods.Items { pod := pods.Items[i] + if pod.Spec.NodeName == "" { + // In case that Pod is in Pending state + continue + } labelValue := "true" // We assume that OFED pod contains only one container to simplify the logic. // We can revisit this logic in the future if needed diff --git a/controllers/suite_test.go b/controllers/suite_test.go index 2defe57bc..9972b19eb 100644 --- a/controllers/suite_test.go +++ b/controllers/suite_test.go @@ -121,11 +121,15 @@ var _ = BeforeSuite(func() { Expect(err).NotTo(HaveOccurred()) staticConfigProvider := staticconfig.NewProvider(staticconfig.StaticConfig{CniBinDirectory: "/opt/cni/bin"}) + migrationCompletionChan := make(chan struct{}) + close(migrationCompletionChan) + err = (&NicClusterPolicyReconciler{ Client: k8sManager.GetClient(), Scheme: k8sManager.GetScheme(), ClusterTypeProvider: clusterTypeProvider, StaticConfigProvider: staticConfigProvider, + MigrationCh: migrationCompletionChan, }).SetupWithManager(k8sManager, testSetupLog) Expect(err).ToNot(HaveOccurred()) diff --git a/controllers/upgrade_controller.go b/controllers/upgrade_controller.go index 01f4599e7..309c50e91 100644 --- a/controllers/upgrade_controller.go +++ b/controllers/upgrade_controller.go @@ -18,6 +18,7 @@ package controllers import ( "context" + "fmt" "time" "github.com/NVIDIA/k8s-operator-libs/pkg/upgrade" @@ -47,6 +48,7 @@ type UpgradeReconciler struct { client.Client Scheme *runtime.Scheme StateManager upgrade.ClusterUpgradeStateManager + MigrationCh chan struct{} } const plannedRequeueInterval = time.Minute * 2 @@ -64,6 +66,11 @@ const UpgradeStateAnnotation = "nvidia.com/ofed-upgrade-state" // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. func (r *UpgradeReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Result, error) { + select { + case <-r.MigrationCh: + case <-ctx.Done(): + return ctrl.Result{}, fmt.Errorf("canceled") + } reqLogger := log.FromContext(ctx) reqLogger.V(consts.LogLevelInfo).Info("Reconciling Upgrade") diff --git a/controllers/upgrade_controller_test.go b/controllers/upgrade_controller_test.go index 3ecbb6ca2..c59d1063b 100644 --- a/controllers/upgrade_controller_test.go +++ b/controllers/upgrade_controller_test.go @@ -55,9 +55,12 @@ var _ = Describe("Upgrade Controller", func() { }) Context("When NicClusterPolicy CR is created", func() { It("Upgrade policy is disabled", func() { + migrationCompletionChan := make(chan struct{}) + close(migrationCompletionChan) upgradeReconciler := &UpgradeReconciler{ - Client: k8sClient, - Scheme: k8sClient.Scheme(), + Client: k8sClient, + Scheme: k8sClient.Scheme(), + MigrationCh: migrationCompletionChan, } req := ctrl.Request{NamespacedName: types.NamespacedName{Name: consts.NicClusterPolicyResourceName}} @@ -76,10 +79,13 @@ var _ = Describe("Upgrade Controller", func() { err := k8sClient.Create(goctx.TODO(), node) Expect(err).NotTo(HaveOccurred()) } + migrationCompletionChan := make(chan struct{}) + close(migrationCompletionChan) upgradeReconciler := &UpgradeReconciler{ - Client: k8sClient, - Scheme: k8sClient.Scheme(), + Client: k8sClient, + Scheme: k8sClient.Scheme(), + MigrationCh: migrationCompletionChan, } // Call removeNodeUpgradeStateLabels function err := upgradeReconciler.removeNodeUpgradeStateLabels(goctx.TODO()) diff --git a/main.go b/main.go index 66e42cdbb..522801091 100644 --- a/main.go +++ b/main.go @@ -77,7 +77,7 @@ func setupWebhookControllers(mgr ctrl.Manager) error { return nil } -func setupCRDControllers(ctx context.Context, c client.Client, mgr ctrl.Manager) error { +func setupCRDControllers(ctx context.Context, c client.Client, mgr ctrl.Manager, migrationChan chan struct{}) error { ctrLog := setupLog.WithName("controller") clusterTypeProvider, err := clustertype.NewProvider(ctx, c) @@ -93,6 +93,7 @@ func setupCRDControllers(ctx context.Context, c client.Client, mgr ctrl.Manager) Scheme: mgr.GetScheme(), ClusterTypeProvider: clusterTypeProvider, // we want to cache information about the cluster type StaticConfigProvider: staticInfoProvider, + MigrationCh: migrationChan, }).SetupWithManager(mgr, ctrLog.WithName("NicClusterPolicy")); err != nil { setupLog.Error(err, "unable to create controller", "controller", "NicClusterPolicy") return err @@ -161,35 +162,26 @@ func main() { os.Exit(1) } - // run migration logic before controllers start - if err := migrate.Migrate(stopCtx, setupLog.WithName("migrate"), directClient); err != nil { - setupLog.Error(err, "failed to run migration logic") - os.Exit(1) + migrationCompletionChan := make(chan struct{}) + m := migrate.Migrator{ + K8sClient: directClient, + MigrationCh: migrationCompletionChan, + LeaderElection: enableLeaderElection, + Logger: ctrl.Log.WithName("Migrator"), } - - err = setupCRDControllers(stopCtx, directClient, mgr) + err = mgr.Add(&m) if err != nil { + setupLog.Error(err, "failed to add Migrator to the Manager") os.Exit(1) } - upgrade.SetDriverName("ofed") - - upgradeLogger := ctrl.Log.WithName("controllers").WithName("Upgrade") - - clusterUpdateStateManager, err := upgrade.NewClusterUpgradeStateManager( - upgradeLogger.WithName("clusterUpgradeManager"), config.GetConfigOrDie(), nil) - + err = setupCRDControllers(stopCtx, directClient, mgr, migrationCompletionChan) if err != nil { - setupLog.Error(err, "unable to create new ClusterUpdateStateManager", "controller", "Upgrade") os.Exit(1) } - if err = (&controllers.UpgradeReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - StateManager: clusterUpdateStateManager, - }).SetupWithManager(mgr); err != nil { - setupLog.Error(err, "unable to create controller", "controller", "Upgrade") + err = setupUpgradeController(mgr, migrationCompletionChan) + if err != nil { os.Exit(1) } @@ -216,3 +208,28 @@ func main() { os.Exit(1) } } + +func setupUpgradeController(mgr ctrl.Manager, migrationChan chan struct{}) error { + upgrade.SetDriverName("ofed") + + upgradeLogger := ctrl.Log.WithName("controllers").WithName("Upgrade") + + clusterUpdateStateManager, err := upgrade.NewClusterUpgradeStateManager( + upgradeLogger.WithName("clusterUpgradeManager"), config.GetConfigOrDie(), nil) + + if err != nil { + setupLog.Error(err, "unable to create new ClusterUpdateStateManager", "controller", "Upgrade") + return err + } + + if err = (&controllers.UpgradeReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + StateManager: clusterUpdateStateManager, + MigrationCh: migrationChan, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "Upgrade") + return err + } + return nil +} diff --git a/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml b/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml index 4f1da2fd1..bdc3cf8a0 100644 --- a/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml +++ b/manifests/state-ofed-driver/0050_ofed-driver-ds.yaml @@ -15,20 +15,21 @@ apiVersion: apps/v1 kind: DaemonSet metadata: labels: - app: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }} + app: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }}-{{ .RuntimeSpec.KernelMajor }}.{{ .RuntimeSpec.KernelMinor }} nvidia.com/ofed-driver: "" - name: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }}-ds + mofed-ds-format-version: "1" + name: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }}-{{ .RuntimeSpec.KernelMajor }}.{{ .RuntimeSpec.KernelMinor }}-ds namespace: {{ .RuntimeSpec.Namespace }} spec: updateStrategy: type: OnDelete selector: matchLabels: - app: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }} + app: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }}-{{ .RuntimeSpec.KernelMajor }}.{{ .RuntimeSpec.KernelMinor }} template: metadata: labels: - app: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }} + app: mofed-{{ .RuntimeSpec.OSName }}{{ .RuntimeSpec.OSVer }}-{{ .RuntimeSpec.KernelMajor }}.{{ .RuntimeSpec.KernelMinor }} driver-pod: mofed-{{ .CrSpec.Version }} nvidia.com/ofed-driver: "" spec: @@ -40,6 +41,14 @@ spec: - key: nvidia.com/gpu operator: Exists effect: NoSchedule + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: nvidia.com/ofed-driver + operator: Exists + topologyKey: kubernetes.io/hostname serviceAccountName: ofed-driver hostNetwork: true {{- if .CrSpec.ImagePullSecrets }} @@ -161,6 +170,8 @@ spec: feature.node.kubernetes.io/pci-15b3.present: "true" feature.node.kubernetes.io/system-os_release.ID: {{ .RuntimeSpec.OSName }} feature.node.kubernetes.io/system-os_release.VERSION_ID: "{{ .RuntimeSpec.OSVer }}" + feature.node.kubernetes.io/kernel-version.major: "{{ .RuntimeSpec.KernelMajor }}" + feature.node.kubernetes.io/kernel-version.minor: "{{ .RuntimeSpec.KernelMinor }}" {{- if .NodeAffinity }} affinity: nodeAffinity: diff --git a/pkg/migrate/migrate.go b/pkg/migrate/migrate.go index 6419f55b9..801861810 100644 --- a/pkg/migrate/migrate.go +++ b/pkg/migrate/migrate.go @@ -21,6 +21,7 @@ import ( "fmt" "github.com/go-logr/logr" + appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" apiErrors "k8s.io/apimachinery/pkg/api/errors" @@ -30,8 +31,34 @@ import ( "github.com/Mellanox/network-operator/pkg/config" "github.com/Mellanox/network-operator/pkg/consts" + + mellanoxv1alpha1 "github.com/Mellanox/network-operator/api/v1alpha1" ) +// Migrator migrates from previous versions +type Migrator struct { + K8sClient client.Client + MigrationCh chan struct{} + LeaderElection bool + Logger logr.Logger +} + +// Implements manager.NeedLeaderElection +func (m *Migrator) NeedLeaderElection() bool { + return m.LeaderElection +} + +// Implements manager.Runnable +func (m *Migrator) Start(ctx context.Context) error { + err := Migrate(ctx, m.Logger, m.K8sClient) + if err != nil { + m.Logger.Error(err, "failed to migrate Network Operator") + return err + } + close(m.MigrationCh) + return nil +} + // Migrate contains logic which should run once during controller start. // The main use case for this handler is network-operator upgrade // for example, the handler can contain logic to change old data format to a new one or @@ -54,6 +81,11 @@ func migrate(ctx context.Context, log logr.Logger, c client.Client) error { log.V(consts.LogLevelError).Error(err, "error trying to remove state label on NV IPAM configmap") return err } + if err := handleSingleMofedDS(ctx, log, c); err != nil { + // critical for the operator operation, will fail Mofed migration + log.V(consts.LogLevelError).Error(err, "error trying to remove state label on NV IPAM configmap") + return err + } return nil } @@ -114,3 +146,47 @@ func removeStateLabelFromNVIpamConfigMap(ctx context.Context, log logr.Logger, c } return nil } + +func handleSingleMofedDS(ctx context.Context, log logr.Logger, c client.Client) error { + ncp := &mellanoxv1alpha1.NicClusterPolicy{} + key := types.NamespacedName{ + Name: consts.NicClusterPolicyResourceName, + } + err := c.Get(ctx, key, ncp) + if apiErrors.IsNotFound(err) { + log.V(consts.LogLevelDebug).Info("NIC ClusterPolicy not found, skip handling single MOFED DS") + return nil + } else if err != nil { + log.V(consts.LogLevelError).Error(err, "fail to get NIC ClusterPolicy") + return err + } + if ncp.Spec.OFEDDriver == nil { + return nil + } + log.V(consts.LogLevelDebug).Info("Searching for single MOFED DS") + dsList := &appsv1.DaemonSetList{} + err = c.List(ctx, dsList, client.MatchingLabels{"nvidia.com/ofed-driver": ""}) + if err != nil { + log.V(consts.LogLevelError).Error(err, "fail to list MOFED DS") + return err + } + for i := range dsList.Items { + mofedDs := &dsList.Items[i] + // The single MOFED DS does not contain the label "mofed-ds-format-version" + _, ok := mofedDs.Labels["mofed-ds-format-version"] + if ok { + continue + } + log.V(consts.LogLevelDebug).Info("Found single MOFED DS", "name", mofedDs.Name) + policy := metav1.DeletePropagationOrphan + err = c.Delete(ctx, mofedDs, &client.DeleteOptions{PropagationPolicy: &policy}) + if err != nil { + log.V(consts.LogLevelError).Error(err, "fail delete single MOFED DS") + return err + } + log.V(consts.LogLevelDebug).Info("Deleted single MOFED DS with orphaned", "name", mofedDs.Name) + return nil + } + log.V(consts.LogLevelDebug).Info("Single MOFED DS not found") + return nil +} diff --git a/pkg/migrate/suite_test.go b/pkg/migrate/suite_test.go index ccbf139a7..2bbc795c1 100644 --- a/pkg/migrate/suite_test.go +++ b/pkg/migrate/suite_test.go @@ -18,6 +18,7 @@ package migrate import ( "context" + "os" "testing" "time" @@ -30,6 +31,8 @@ import ( logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" + mellanoxcomv1alpha1 "github.com/Mellanox/network-operator/api/v1alpha1" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -58,9 +61,21 @@ var _ = BeforeSuite(func() { logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) testLog = logf.Log.WithName("test-log").WithName("setup") + err := mellanoxcomv1alpha1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + // +kubebuilder:scaffold:scheme + By("bootstrapping test environment") + // Go to project root directory + err = os.Chdir("../..") + Expect(err).NotTo(HaveOccurred()) - testEnv = &envtest.Environment{} + testEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{"config/crd/bases"}, + CRDInstallOptions: envtest.CRDInstallOptions{ErrorIfPathMissing: true}, + ErrorIfCRDPathMissing: true, + } cfg, err := testEnv.Start() Expect(err).NotTo(HaveOccurred()) diff --git a/pkg/nodeinfo/attributes.go b/pkg/nodeinfo/attributes.go index 12890f256..97aa99a90 100644 --- a/pkg/nodeinfo/attributes.go +++ b/pkg/nodeinfo/attributes.go @@ -31,12 +31,16 @@ const ( NodeLabelOSName = "feature.node.kubernetes.io/system-os_release.ID" NodeLabelOSVer = "feature.node.kubernetes.io/system-os_release.VERSION_ID" NodeLabelKernelVerFull = "feature.node.kubernetes.io/kernel-version.full" + NodeLabelKernelVerMajor = "feature.node.kubernetes.io/kernel-version.major" + NodeLabelKernelVerMinor = "feature.node.kubernetes.io/kernel-version.minor" + NodeLabelKernelVerRev = "feature.node.kubernetes.io/kernel-version.revision" NodeLabelHostname = "kubernetes.io/hostname" NodeLabelCPUArch = "kubernetes.io/arch" NodeLabelMlnxNIC = "feature.node.kubernetes.io/pci-15b3.present" NodeLabelNvGPU = "nvidia.com/gpu.present" NodeLabelWaitOFED = "network.nvidia.com/operator.mofed.wait" NodeLabelCudaVersionMajor = "nvidia.com/cuda.driver.major" + NodeLabelOSTreeVersion = "feature.node.kubernetes.io/system-os_release.OSTREE_VERSION" ) type AttributeType int diff --git a/pkg/nodeinfo/node_info.go b/pkg/nodeinfo/node_info.go index 9b67ae54d..5ba5ac743 100644 --- a/pkg/nodeinfo/node_info.go +++ b/pkg/nodeinfo/node_info.go @@ -22,6 +22,8 @@ package nodeinfo */ import ( + "fmt" + corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -34,6 +36,7 @@ var MellanoxNICListOptions = []client.ListOption{ type Provider interface { // GetNodesAttributes retrieves node attributes for nodes matching the filter criteria GetNodesAttributes(filters ...Filter) []NodeAttributes + GetNodePools(filters ...Filter) []NodePool } // NewProvider creates a new Provider object @@ -57,3 +60,87 @@ func (p *provider) GetNodesAttributes(filters ...Filter) (attrs []NodeAttributes } return attrs } + +type NodePool struct { + Name string + OsName string + OsVersion string + RhcosVersion string + KernelMajor string + KernelMinor string + Arch string +} + +// GetNodePools partitions nodes into one or more node pools. The list of nodes to partition +// is defined by the filters provided as input. +// +// Nodes are partitioned by osVersion-kernelVersion pair. +func (p *provider) GetNodePools(filters ...Filter) []NodePool { + filtered := p.nodes + for _, filter := range filters { + filtered = filter.Apply(filtered) + } + + nodePoolMap := make(map[string]NodePool) + + for _, node := range filtered { + node := node + nodeLabels := node.GetLabels() + + nodePool := NodePool{} + osName, ok := nodeLabels[NodeLabelOSName] + if !ok { + log.Info("WARNING: Could not find NFD labels for node. Is NFD installed?", "Node", node.Name) + continue + } + nodePool.OsName = osName + + osVersion, ok := nodeLabels[NodeLabelOSVer] + if !ok { + log.Info("WARNING: Could not find NFD labels for node. Is NFD installed?", "Node", node.Name) + continue + } + nodePool.OsVersion = osVersion + + arch, ok := nodeLabels[NodeLabelCPUArch] + if !ok { + log.Info("WARNING: Could not find NFD labels for node. Is NFD installed?", "Node", node.Name) + continue + } + nodePool.Arch = arch + + rhcos, ok := nodeLabels[NodeLabelOSTreeVersion] + if ok { + nodePool.RhcosVersion = rhcos + } + + kernelMajor, ok := nodeLabels[NodeLabelKernelVerMajor] + if !ok { + log.Info("WARNING: Could not find NFD labels for node. Is NFD installed?", "Node", node.Name) + continue + } + nodePool.KernelMajor = kernelMajor + + kernelMinor, ok := nodeLabels[NodeLabelKernelVerMinor] + if !ok { + log.Info("WARNING: Could not find NFD labels for node. Is NFD installed?", "Node", node.Name) + continue + } + nodePool.KernelMinor = kernelMinor + + kernel := fmt.Sprintf("%s.%s", nodePool.KernelMajor, nodePool.KernelMinor) + nodePool.Name = fmt.Sprintf("%s%s-%s", nodePool.OsName, nodePool.OsVersion, kernel) + + if _, exists := nodePoolMap[nodePool.Name]; !exists { + nodePoolMap[nodePool.Name] = nodePool + log.Info("NodePool found", "name", nodePool.Name) + } + } + + nodePools := make([]NodePool, 0) + for _, np := range nodePoolMap { + nodePools = append(nodePools, np) + } + + return nodePools +} diff --git a/pkg/nodeinfo/node_info_test.go b/pkg/nodeinfo/node_info_test.go index 2f22ea73a..f21742749 100644 --- a/pkg/nodeinfo/node_info_test.go +++ b/pkg/nodeinfo/node_info_test.go @@ -23,6 +23,15 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +const ( + testArch = "amd64" + testOsUbuntu = "ubuntu" + testOsRhcos = "rhcos" + testOsVer = "22.04" + testKernelMajor = "5" + testKernelMinor = "15" +) + // A Filter applies a filter on a list of Nodes type dummyFilter struct { called bool @@ -103,4 +112,158 @@ var _ = Describe("nodeinfo Provider tests", func() { Expect(len(attrs)).To(Equal(0)) }) }) + + Context("GetNodePools with filter", func() { + It("Should return an empty list of pools", func() { + filter := &dummyFilter{filtered: []*corev1.Node{}} + provider := NewProvider([]*corev1.Node{ + getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelMajor, testKernelMinor, testArch), + getNodeWithNfdLabels("Node-2", testOsUbuntu, testOsVer, testKernelMajor, testKernelMinor, testArch), + }) + pools := provider.GetNodePools(filter) + Expect(len(pools)).To(Equal(0)) + }) + It("Should return an empty list of pools, filter by label", func() { + node := getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelMajor, testKernelMinor, testArch) + delete(node.Labels, NodeLabelMlnxNIC) + provider := NewProvider([]*corev1.Node{node}) + pools := provider.GetNodePools(NewNodeLabelFilterBuilder().WithLabel(NodeLabelMlnxNIC, "true").Build()) + Expect(len(pools)).To(Equal(0)) + }) + }) + + Context("GetNodePools without filter", func() { + It("Should return pool", func() { + provider := NewProvider([]*corev1.Node{ + getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelMajor, testKernelMinor, testArch), + getNodeWithNfdLabels("Node-2", testOsUbuntu, testOsVer, testKernelMajor, testKernelMinor, testArch), + }) + pools := provider.GetNodePools() + Expect(len(pools)).To(Equal(1)) + Expect(pools[0].Arch).To(Equal(testArch)) + Expect(pools[0].OsName).To(Equal(testOsUbuntu)) + Expect(pools[0].OsVersion).To(Equal(testOsVer)) + Expect(pools[0].KernelMajor).To(Equal(testKernelMajor)) + Expect(pools[0].KernelMinor).To(Equal(testKernelMinor)) + }) + DescribeTable("GetNodePools", + func(nodeList []*corev1.Node, expectedPools int) { + provider := NewProvider(nodeList) + pools := provider.GetNodePools() + Expect(len(pools)).To(Equal(expectedPools)) + }, + Entry("single pool, multiple nodes same NFD labels", []*corev1.Node{ + getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelMajor, testKernelMinor, testArch), + getNodeWithNfdLabels("Node-2", testOsUbuntu, testOsVer, testKernelMajor, testKernelMinor, testArch), + getNodeWithNfdLabels("Node-3", testOsUbuntu, testOsVer, testKernelMajor, testKernelMinor, testArch), + }, 1), + Entry("2 pools, multiple nodes different OS NFD labels", []*corev1.Node{ + getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelMajor, testKernelMinor, testArch), + getNodeWithNfdLabels("Node-2", testOsRhcos, testOsVer, testKernelMajor, testKernelMinor, testArch), + getNodeWithNfdLabels("Node-3", testOsRhcos, testOsVer, testKernelMajor, testKernelMinor, testArch), + }, 2), + Entry("2 pools, multiple nodes different OSVer NFD labels", []*corev1.Node{ + getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelMajor, testKernelMinor, testArch), + getNodeWithNfdLabels("Node-2", testOsUbuntu, "20.04", testKernelMajor, testKernelMinor, testArch), + }, 2), + Entry("2 pools, multiple nodes different KernelMajor NFD labels", []*corev1.Node{ + getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelMajor, testKernelMinor, testArch), + getNodeWithNfdLabels("Node-2", testOsUbuntu, testOsVer, "6", testKernelMinor, testArch), + }, 2), + Entry("2 pools, multiple nodes different KernelMinor NFD labels", []*corev1.Node{ + getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelMajor, testKernelMinor, testArch), + getNodeWithNfdLabels("Node-2", testOsUbuntu, testOsVer, testKernelMajor, "18", testArch), + }, 2), + Entry("1 pool, multiple nodes different arch NFD labels", []*corev1.Node{ + getNodeWithNfdLabels("Node-1", testOsUbuntu, testOsVer, testKernelMajor, testKernelMinor, testArch), + getNodeWithNfdLabels("Node-2", testOsUbuntu, testOsVer, testKernelMajor, testKernelMinor, "arm"), + }, 1), + Entry("no pool, node without NFD labels", []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{Name: "Node-1"}, + }, + }, 0), + Entry("no pool, node with missing osName NFD label", []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "Node-1", + Labels: map[string]string{ + NodeLabelMlnxNIC: "true", + NodeLabelOSVer: testOsVer, + NodeLabelKernelVerMajor: testKernelMajor, + NodeLabelKernelVerMinor: testKernelMinor, + NodeLabelCPUArch: testArch, + }}, + }, + }, 0), + Entry("no pool, node with missing osVer NFD label", []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "Node-1", + Labels: map[string]string{ + NodeLabelMlnxNIC: "true", + NodeLabelOSName: testOsUbuntu, + NodeLabelKernelVerMajor: testKernelMajor, + NodeLabelKernelVerMinor: testKernelMinor, + NodeLabelCPUArch: testArch, + }}, + }, + }, 0), + Entry("no pool, node with missing arch NFD label", []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "Node-1", + Labels: map[string]string{ + NodeLabelMlnxNIC: "true", + NodeLabelOSName: testOsUbuntu, + NodeLabelOSVer: testOsVer, + NodeLabelKernelVerMajor: testKernelMajor, + NodeLabelKernelVerMinor: testKernelMinor, + }}, + }, + }, 0), + Entry("no pool, node with missing Kernel Major NFD label", []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "Node-1", + Labels: map[string]string{ + NodeLabelMlnxNIC: "true", + NodeLabelOSName: testOsUbuntu, + NodeLabelOSVer: testOsVer, + NodeLabelKernelVerMinor: testKernelMinor, + NodeLabelCPUArch: testArch, + }}, + }, + }, 0), + Entry("no pool, node with missing Kernel Minor NFD label", []*corev1.Node{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "Node-1", + Labels: map[string]string{ + NodeLabelMlnxNIC: "true", + NodeLabelOSName: testOsUbuntu, + NodeLabelOSVer: testOsVer, + NodeLabelKernelVerMajor: testKernelMajor, + NodeLabelCPUArch: testArch, + }}, + }, + }, 0), + ) + }) }) + +func getNodeWithNfdLabels(name, osName, osVer, kernelMajor, kernelMinorPool, arch string) *corev1.Node { + return &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + NodeLabelMlnxNIC: "true", + NodeLabelOSName: osName, + NodeLabelOSVer: osVer, + NodeLabelKernelVerMajor: kernelMajor, + NodeLabelKernelVerMinor: kernelMinorPool, + NodeLabelCPUArch: arch, + }, + }, + } +} diff --git a/pkg/state/state_ofed.go b/pkg/state/state_ofed.go index 472e0264c..76fc8edec 100644 --- a/pkg/state/state_ofed.go +++ b/pkg/state/state_ofed.go @@ -150,6 +150,8 @@ type ofedRuntimeSpec struct { CPUArch string OSName string OSVer string + KernelMajor string + KernelMinor string MOFEDImageName string InitContainerConfig initContainerConfig // is true if cluster type is Openshift @@ -381,98 +383,126 @@ func (s *stateOFED) getManifestObjects( cr *mellanoxv1alpha1.NicClusterPolicy, nodeInfo nodeinfo.Provider, clusterInfo clustertype.Provider) ([]*unstructured.Unstructured, error) { reqLogger := log.FromContext(ctx) - attrs := nodeInfo.GetNodesAttributes( + nodePools := nodeInfo.GetNodePools( nodeinfo.NewNodeLabelFilterBuilder().WithLabel(nodeinfo.NodeLabelMlnxNIC, "true").Build()) - if len(attrs) == 0 { + if len(nodePools) == 0 { reqLogger.V(consts.LogLevelInfo).Info("No nodes with Mellanox NICs where found in the cluster.") return []*unstructured.Unstructured{}, nil } + objs := make([]*unstructured.Unstructured, 0) + renderedObjsMap := stateObjects{} + for _, np := range nodePools { + nodePool := np + setProbesDefaults(cr) - // TODO: Render daemonset multiple times according to CPUXOS matrix (ATM assume all nodes are the same) - // Note: it is assumed MOFED driver container is able to handle multiple kernel version e.g by triggering DKMS - // if driver was compiled against a missmatching kernel to begin with. - if err := s.checkAttributesExist(attrs[0], - nodeinfo.AttrTypeCPUArch, nodeinfo.AttrTypeOSName, nodeinfo.AttrTypeOSVer); err != nil { - return nil, err - } - nodeAttr := attrs[0].Attributes + // Update MOFED Env variables with defaults for the cluster + cr.Spec.OFEDDriver.Env = s.mergeWithDefaultEnvs(cr.Spec.OFEDDriver.Env, &nodePool) - if cr.Spec.OFEDDriver.StartupProbe == nil { - cr.Spec.OFEDDriver.StartupProbe = &mellanoxv1alpha1.PodProbeSpec{ - InitialDelaySeconds: 10, - PeriodSeconds: 10, + additionalVolMounts := additionalVolumeMounts{} + osname := nodePool.OsName + + // set any custom ssl key/certificate configuration provided + err := s.handleCertConfig(ctx, cr, osname, additionalVolMounts) + if err != nil { + return nil, err } - } - if cr.Spec.OFEDDriver.LivenessProbe == nil { - cr.Spec.OFEDDriver.LivenessProbe = &mellanoxv1alpha1.PodProbeSpec{ - InitialDelaySeconds: 30, - PeriodSeconds: 30, + // set any custom repo configuration provided + err = s.handleRepoConfig(ctx, cr, osname, additionalVolMounts) + if err != nil { + return nil, err } - } - if cr.Spec.OFEDDriver.ReadinessProbe == nil { - cr.Spec.OFEDDriver.ReadinessProbe = &mellanoxv1alpha1.PodProbeSpec{ - InitialDelaySeconds: 10, - PeriodSeconds: 30, + renderData := &ofedManifestRenderData{ + CrSpec: cr.Spec.OFEDDriver, + RuntimeSpec: &ofedRuntimeSpec{ + runtimeSpec: runtimeSpec{config.FromEnv().State.NetworkOperatorResourceNamespace}, + CPUArch: nodePool.Arch, + OSName: nodePool.OsName, + OSVer: nodePool.OsVersion, + KernelMajor: nodePool.KernelMajor, + KernelMinor: nodePool.KernelMinor, + MOFEDImageName: s.getMofedDriverImageName(cr, &nodePool, reqLogger), + InitContainerConfig: s.getInitContainerConfig(cr, reqLogger, + config.FromEnv().State.OFEDState.InitContainerImage), + IsOpenshift: clusterInfo.IsOpenshift(), + }, + Tolerations: cr.Spec.Tolerations, + NodeAffinity: cr.Spec.NodeAffinity, + AdditionalVolumeMounts: additionalVolMounts, + } + // render objects + reqLogger.V(consts.LogLevelDebug).Info("Rendering objects", "data:", renderData) + renderedObjs, err := s.renderer.RenderObjects(&render.TemplatingData{Data: renderData}) + if err != nil { + return nil, errors.Wrap(err, "failed to render objects") + } + for _, o := range renderedObjs { + if !renderedObjsMap.Exist(o.GroupVersionKind(), types.NamespacedName{ + Name: o.GetName(), + Namespace: o.GetNamespace()}) { + renderedObjsMap.Add(o.GroupVersionKind(), types.NamespacedName{Name: o.GetName(), Namespace: o.GetNamespace()}) + objs = append(objs, o) + } } } + reqLogger.V(consts.LogLevelDebug).Info("Rendered", "objects:", objs) + return objs, nil +} - // Update MOFED Env variables with defaults for the cluster - cr.Spec.OFEDDriver.Env = s.mergeWithDefaultEnvs(cr.Spec.OFEDDriver.Env, nodeAttr) - - additionalVolMounts := additionalVolumeMounts{} - osname := nodeAttr[nodeinfo.AttrTypeOSName] - // set any custom ssl key/certificate configuration provided - if cr.Spec.OFEDDriver.CertConfig != nil && cr.Spec.OFEDDriver.CertConfig.Name != "" { - destinationDir, err := getCertConfigPath(osname) +func (s *stateOFED) handleRepoConfig( + ctx context.Context, cr *mellanoxv1alpha1.NicClusterPolicy, osname string, mounts additionalVolumeMounts) error { + if cr.Spec.OFEDDriver.RepoConfig != nil && cr.Spec.OFEDDriver.RepoConfig.Name != "" { + destinationDir, err := getRepoConfigPath(osname) if err != nil { - return nil, fmt.Errorf("failed to get destination directory for custom TLS certificates config: %v", err) + return fmt.Errorf("failed to get destination directory for custom repo config: %v", err) } - err = s.handleAdditionalMounts(ctx, &additionalVolMounts, cr.Spec.OFEDDriver.CertConfig.Name, destinationDir) + err = s.handleAdditionalMounts(ctx, &mounts, cr.Spec.OFEDDriver.RepoConfig.Name, destinationDir) if err != nil { - return nil, fmt.Errorf("failed to mount volumes for custom TLS certificates: %v", err) + return fmt.Errorf("failed to mount volumes for custom repositories configuration: %v", err) } } + return nil +} - // set any custom repo configuration provided - if cr.Spec.OFEDDriver.RepoConfig != nil && cr.Spec.OFEDDriver.RepoConfig.Name != "" { - destinationDir, err := getRepoConfigPath(osname) +func (s *stateOFED) handleCertConfig( + ctx context.Context, cr *mellanoxv1alpha1.NicClusterPolicy, osname string, mounts additionalVolumeMounts) error { + if cr.Spec.OFEDDriver.CertConfig != nil && cr.Spec.OFEDDriver.CertConfig.Name != "" { + destinationDir, err := getCertConfigPath(osname) if err != nil { - return nil, fmt.Errorf("failed to get destination directory for custom repo config: %v", err) + return fmt.Errorf("failed to get destination directory for custom TLS certificates config: %v", err) } - err = s.handleAdditionalMounts(ctx, &additionalVolMounts, cr.Spec.OFEDDriver.RepoConfig.Name, destinationDir) + err = s.handleAdditionalMounts(ctx, &mounts, cr.Spec.OFEDDriver.CertConfig.Name, destinationDir) if err != nil { - return nil, fmt.Errorf("failed to mount volumes for custom repositories configuration: %v", err) + return fmt.Errorf("failed to mount volumes for custom TLS certificates: %v", err) } } + return nil +} - renderData := &ofedManifestRenderData{ - CrSpec: cr.Spec.OFEDDriver, - RuntimeSpec: &ofedRuntimeSpec{ - runtimeSpec: runtimeSpec{config.FromEnv().State.NetworkOperatorResourceNamespace}, - CPUArch: nodeAttr[nodeinfo.AttrTypeCPUArch], - OSName: nodeAttr[nodeinfo.AttrTypeOSName], - OSVer: nodeAttr[nodeinfo.AttrTypeOSVer], - MOFEDImageName: s.getMofedDriverImageName(cr, nodeAttr, reqLogger), - InitContainerConfig: s.getInitContainerConfig(cr, reqLogger, - config.FromEnv().State.OFEDState.InitContainerImage), - IsOpenshift: clusterInfo.IsOpenshift(), - }, - Tolerations: cr.Spec.Tolerations, - NodeAffinity: cr.Spec.NodeAffinity, - AdditionalVolumeMounts: additionalVolMounts, +func setProbesDefaults(cr *mellanoxv1alpha1.NicClusterPolicy) { + if cr.Spec.OFEDDriver.StartupProbe == nil { + cr.Spec.OFEDDriver.StartupProbe = &mellanoxv1alpha1.PodProbeSpec{ + InitialDelaySeconds: 10, + PeriodSeconds: 10, + } } - // render objects - reqLogger.V(consts.LogLevelDebug).Info("Rendering objects", "data:", renderData) - objs, err := s.renderer.RenderObjects(&render.TemplatingData{Data: renderData}) - if err != nil { - return nil, errors.Wrap(err, "failed to render objects") + + if cr.Spec.OFEDDriver.LivenessProbe == nil { + cr.Spec.OFEDDriver.LivenessProbe = &mellanoxv1alpha1.PodProbeSpec{ + InitialDelaySeconds: 30, + PeriodSeconds: 30, + } + } + + if cr.Spec.OFEDDriver.ReadinessProbe == nil { + cr.Spec.OFEDDriver.ReadinessProbe = &mellanoxv1alpha1.PodProbeSpec{ + InitialDelaySeconds: 10, + PeriodSeconds: 30, + } } - reqLogger.V(consts.LogLevelDebug).Info("Rendered", "objects:", objs) - return objs, nil } // prepare configuration for the init container, @@ -502,7 +532,7 @@ func (s *stateOFED) getInitContainerConfig( // getMofedDriverImageName generates MOFED driver image name based on the driver version specified in CR // TODO(adrianc): in Network-Operator v1.5.0, we should just use the new naming scheme func (s *stateOFED) getMofedDriverImageName(cr *mellanoxv1alpha1.NicClusterPolicy, - nodeAttr map[nodeinfo.AttributeType]string, reqLogger logr.Logger) string { + pool *nodeinfo.NodePool, reqLogger logr.Logger) string { curDriverVer, err := semver.NewVersion(cr.Spec.OFEDDriver.Version) if err != nil { reqLogger.V(consts.LogLevelDebug).Info("failed to parse ofed driver version as semver") @@ -513,9 +543,9 @@ func (s *stateOFED) getMofedDriverImageName(cr *mellanoxv1alpha1.NicClusterPolic cr.Spec.OFEDDriver.Repository, cr.Spec.OFEDDriver.Image, cr.Spec.OFEDDriver.Version, - nodeAttr[nodeinfo.AttrTypeOSName], - nodeAttr[nodeinfo.AttrTypeOSVer], - nodeAttr[nodeinfo.AttrTypeCPUArch]) + pool.OsName, + pool.OsVersion, + pool.Arch) } // readOpenshiftProxyConfig reads ClusterWide Proxy configuration for Openshift @@ -648,23 +678,21 @@ func (s *stateOFED) setEnvFromClusterWideProxy(cr *mellanoxv1alpha1.NicClusterPo // mergeWithDefaultEnvs returns env variables provided in currentEnvs merged with default // env variables for MOFED container, taking into account attributes of the k8s cluster. func (s *stateOFED) mergeWithDefaultEnvs( - currentEnvs []v1.EnvVar, nodeAttrs map[nodeinfo.AttributeType]string) []v1.EnvVar { + currentEnvs []v1.EnvVar, pool *nodeinfo.NodePool) []v1.EnvVar { if envVarsWithGet(currentEnvs).Get("CREATE_IFNAMES_UDEV") != nil { // already exists dont overwrite return currentEnvs } // CREATE_IFNAMES_UDEV: should be set to true for ubuntu < 22.04 , RHEL < 9, OCP < 4.13 if not provided - osName := nodeAttrs[nodeinfo.AttrTypeOSName] - osVer := nodeAttrs[nodeinfo.AttrTypeOSVer] createIfnameUdevEnv := v1.EnvVar{Name: "CREATE_IFNAMES_UDEV", Value: "true"} - currVer, err := semver.NewVersion(osVer) + currVer, err := semver.NewVersion(pool.OsVersion) if err != nil { return currentEnvs } - switch osName { + switch pool.OsName { case "ubuntu": if currVer.LessThan(semver.MustParse("22.04")) { currentEnvs = append(currentEnvs, createIfnameUdevEnv) diff --git a/pkg/state/state_ofed_test.go b/pkg/state/state_ofed_test.go index f2f31bf0f..05ea91576 100644 --- a/pkg/state/state_ofed_test.go +++ b/pkg/state/state_ofed_test.go @@ -17,16 +17,25 @@ package state import ( + "context" + "fmt" "strings" + "k8s.io/apimachinery/pkg/runtime" + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" osconfigv1 "github.com/openshift/api/config/v1" + appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/Mellanox/network-operator/api/v1alpha1" "github.com/Mellanox/network-operator/pkg/nodeinfo" + "github.com/Mellanox/network-operator/pkg/render" + "github.com/Mellanox/network-operator/pkg/testing/mocks" + "github.com/Mellanox/network-operator/pkg/utils" ) const ( @@ -35,20 +44,28 @@ const ( testClusterWideNoProxy = "no-proxy-cluster-wide" testNicPolicyHTTPProxy = "http-policy" testNicPolicyNoProxy = "no-proxy-policy" + osName = "ubuntu" + osVer = "22.04" + kernelMajor = "5" + kernelMinorPool1 = "15" + kernelMinorPool2 = "20" ) var _ = Describe("MOFED state test", func() { var stateOfed stateOFED + var ctx context.Context BeforeEach(func() { stateOfed = stateOFED{} + ctx = context.Background() }) Context("getMofedDriverImageName", func() { - nodeAttr := make(map[nodeinfo.AttributeType]string) - nodeAttr[nodeinfo.AttrTypeCPUArch] = "amd64" - nodeAttr[nodeinfo.AttrTypeOSName] = "ubuntu" - nodeAttr[nodeinfo.AttrTypeOSVer] = "20.04" + nodePool := &nodeinfo.NodePool{ + OsName: "ubuntu", + OsVersion: "20.04", + Arch: "amd64", + } cr := &v1alpha1.NicClusterPolicy{ Spec: v1alpha1.NicClusterPolicySpec{ @@ -63,17 +80,17 @@ var _ = Describe("MOFED state test", func() { It("generates new image format", func() { cr.Spec.OFEDDriver.Version = "5.7-1.0.0.0" - imageName := stateOfed.getMofedDriverImageName(cr, nodeAttr, testLogger) + imageName := stateOfed.getMofedDriverImageName(cr, nodePool, testLogger) Expect(imageName).To(Equal("nvcr.io/mellanox/mofed:5.7-1.0.0.0-ubuntu20.04-amd64")) }) It("generates new image format double digit minor", func() { cr.Spec.OFEDDriver.Version = "5.10-0.0.0.1" - imageName := stateOfed.getMofedDriverImageName(cr, nodeAttr, testLogger) + imageName := stateOfed.getMofedDriverImageName(cr, nodePool, testLogger) Expect(imageName).To(Equal("nvcr.io/mellanox/mofed:5.10-0.0.0.1-ubuntu20.04-amd64")) }) It("return new image format in case of a bad version", func() { cr.Spec.OFEDDriver.Version = "1.1.1.1.1" - imageName := stateOfed.getMofedDriverImageName(cr, nodeAttr, testLogger) + imageName := stateOfed.getMofedDriverImageName(cr, nodePool, testLogger) Expect(imageName).To(Equal("nvcr.io/mellanox/mofed:1.1.1.1.1-ubuntu20.04-amd64")) }) }) @@ -179,11 +196,11 @@ var _ = Describe("MOFED state test", func() { DescribeTable("mergeWithDefaultEnvs", func(osName string, osVer string, currEnvs []v1.EnvVar, expectedEnvs []v1.EnvVar) { - nodeAttr := make(map[nodeinfo.AttributeType]string) - nodeAttr[nodeinfo.AttrTypeOSName] = osName - nodeAttr[nodeinfo.AttrTypeOSVer] = osVer - - mergedEnvs := stateOfed.mergeWithDefaultEnvs(currEnvs, nodeAttr) + nodePool := nodeinfo.NodePool{ + OsName: osName, + OsVersion: osVer, + } + mergedEnvs := stateOfed.mergeWithDefaultEnvs(currEnvs, &nodePool) Expect(mergedEnvs).To(BeEquivalentTo(expectedEnvs)) }, Entry("RHEL 8.6", "rhel", "8.6", []v1.EnvVar{{Name: "Foo", Value: "Bar"}}, @@ -206,4 +223,117 @@ var _ = Describe("MOFED state test", func() { []v1.EnvVar{{Name: "Foo", Value: "Bar"}}), Entry("Ubuntu 23.04", "ubuntu", "23.04", []v1.EnvVar{}, []v1.EnvVar{}), ) + + Context("Render Manifests", func() { + It("Should Render multiple DaemonSet", func() { + client := mocks.ControllerRuntimeClient{} + manifestBaseDir := "../../manifests/state-ofed-driver" + scheme := runtime.NewScheme() + + files, err := utils.GetFilesWithSuffix(manifestBaseDir, render.ManifestFileSuffix...) + Expect(err).NotTo(HaveOccurred()) + renderer := render.NewRenderer(files) + + ofedState := stateOFED{ + stateSkel: stateSkel{ + name: stateOFEDName, + description: stateOFEDDescription, + client: &client, + scheme: scheme, + renderer: renderer, + }, + } + cr := &v1alpha1.NicClusterPolicy{} + cr.Name = "nic-cluster-policy" + cr.Spec.OFEDDriver = &v1alpha1.OFEDDriverSpec{ + ImageSpec: v1alpha1.ImageSpec{ + Image: "mofed", + Repository: "nvcr.io/mellanox", + Version: "23.10-0.5.5.0", + }, + } + By("Creating NodeProvider with 3 Nodes, that form 2 Node pools") + infoProvider := nodeinfo.NewProvider([]*v1.Node{ + getNode("node1", kernelMinorPool1), + getNode("node2", kernelMinorPool2), + getNode("node3", kernelMinorPool2), + }) + objs, err := ofedState.getManifestObjects(ctx, cr, infoProvider, &dummyProvider{}) + Expect(err).NotTo(HaveOccurred()) + // Expect 5 objects: 1 DS per pool, Service Account, Role, RoleBinding + Expect(len(objs)).To(Equal(5)) + By("Verify DaemonSets NodeSelector") + for _, obj := range objs { + if obj.GetKind() != "DaemonSet" { + continue + } + ds := appsv1.DaemonSet{} + err = runtime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, &ds) + Expect(err).NotTo(HaveOccurred()) + if ds.Name == fmt.Sprintf("mofed-%s%s-%s.%s-ds", osName, osVer, kernelMajor, kernelMinorPool1) { + verifyDSNodeSelector(ds.Spec.Template.Spec.NodeSelector, kernelMinorPool1) + } + if ds.Name == fmt.Sprintf("mofed-%s%s-%s.%s-ds", osName, osVer, kernelMajor, kernelMinorPool2) { + verifyDSNodeSelector(ds.Spec.Template.Spec.NodeSelector, kernelMinorPool2) + } + verifyPodAntiInfinity(ds.Spec.Template.Spec.Affinity) + } + }) + }) }) + +func verifyPodAntiInfinity(affinity *v1.Affinity) { + Expect(affinity).NotTo(BeNil()) + expected := v1.Affinity{ + PodAntiAffinity: &v1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "nvidia.com/ofed-driver", + Operator: metav1.LabelSelectorOpExists, + }, + }, + }, + TopologyKey: "kubernetes.io/hostname", + }, + }, + }, + } + Expect(*affinity).To(BeEquivalentTo(expected)) +} + +func verifyDSNodeSelector(selector map[string]string, kernelMinor string) { + nsMellanox, ok := selector["feature.node.kubernetes.io/pci-15b3.present"] + Expect(ok).To(BeTrue()) + Expect(nsMellanox).To(Equal("true")) + nsOsName, ok := selector["feature.node.kubernetes.io/system-os_release.ID"] + Expect(ok).To(BeTrue()) + Expect(nsOsName).To(Equal(osName)) + nsOsVer, ok := selector["feature.node.kubernetes.io/system-os_release.VERSION_ID"] + Expect(ok).To(BeTrue()) + Expect(nsOsVer).To(Equal(osVer)) + nsKernelMajor, ok := selector["feature.node.kubernetes.io/kernel-version.major"] + Expect(ok).To(BeTrue()) + Expect(nsKernelMajor).To(Equal(kernelMajor)) + nsKernelMinor, ok := selector["feature.node.kubernetes.io/kernel-version.minor"] + Expect(ok).To(BeTrue()) + Expect(nsKernelMinor).To(Equal(kernelMinor)) +} + +func getNode(name, kernelMinorPool string) *v1.Node { + return &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Labels: map[string]string{ + nodeinfo.NodeLabelMlnxNIC: "true", + nodeinfo.NodeLabelOSName: osName, + nodeinfo.NodeLabelOSVer: osVer, + nodeinfo.NodeLabelKernelVerMajor: kernelMajor, + nodeinfo.NodeLabelKernelVerMinor: kernelMinorPool, + nodeinfo.NodeLabelCPUArch: "amd64", + }, + }, + } +} diff --git a/pkg/state/state_skel.go b/pkg/state/state_skel.go index b943e42bf..1b9fa1933 100644 --- a/pkg/state/state_skel.go +++ b/pkg/state/state_skel.go @@ -19,7 +19,6 @@ package state import ( "context" "encoding/json" - "fmt" "github.com/go-logr/logr" "github.com/pkg/errors" @@ -34,7 +33,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log" "github.com/Mellanox/network-operator/pkg/consts" - "github.com/Mellanox/network-operator/pkg/nodeinfo" "github.com/Mellanox/network-operator/pkg/render" "github.com/Mellanox/network-operator/pkg/revision" ) @@ -464,13 +462,3 @@ func (s *stateSkel) isDaemonSetReady(uds *unstructured.Unstructured, reqLogger l } return false, nil } - -// Check if provided attrTypes are present in NodeAttributes.Attributes -func (s *stateSkel) checkAttributesExist(attrs nodeinfo.NodeAttributes, attrTypes ...nodeinfo.AttributeType) error { - for _, t := range attrTypes { - if _, ok := attrs.Attributes[t]; !ok { - return fmt.Errorf("mandatory node attribute does not exist for node %s", attrs.Name) - } - } - return nil -}