Skip to content

Commit

Permalink
feat: Support OFED with DTK
Browse files Browse the repository at this point in the history
In Openshift, in order to OFED containter to be able
to download and compile the needed Kernel files, it is
required to install a cluster-wide entitlement.

This requirement is not user friendly.

In order to avoid this, a container image with the needed files
is available in Openshift distributions.
This image is called DriverToolKit aka DTK.

By using this container as a side-car to MOFED container,
the modules can be compiled without entitlement.

Changes:
API:
 - NicClusterPolicy CRD: add bool 'useOcpDriverToolkit'
   under 'ofedDriver'. Default is 'true'
 - Helm : add support to 'useOcpDriverToolkit'

OFED state:
 - In case of OCP and 'useOcpDriverToolkit' is true,
   find DTK image based on NFD label of node.
 - If available, add to MOFED DS a DTK container,
   change entrypoint logic.

Signed-off-by: Fred Rolland <frolland@nvidia.com>
  • Loading branch information
rollandf committed Dec 20, 2023
1 parent 31f75e1 commit 2c6b188
Show file tree
Hide file tree
Showing 15 changed files with 252 additions and 3 deletions.
3 changes: 3 additions & 0 deletions api/v1alpha1/nicclusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ type OFEDDriverSpec struct {
// +kubebuilder:default:=300
// +kubebuilder:validation:Minimum:=0
TerminationGracePeriodSeconds int64 `json:"terminationGracePeriodSeconds,omitempty"`
// UseOCPDriverToolkit indicates if DriverToolkit image should be used on OpenShift to build and install driver modules
// +kubebuilder:default:=true
UseOCPDriverToolkit bool `json:"useOcpDriverToolkit"`
}

// DriverUpgradePolicySpec describes policy configuration for automatic upgrades
Expand Down
6 changes: 6 additions & 0 deletions config/crd/bases/mellanox.com_nicclusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -576,12 +576,18 @@ spec:
type: integer
type: object
type: object
useOcpDriverToolkit:
default: true
description: UseOCPDriverToolkit indicates if DriverToolkit image
should be used on OpenShift to build and install driver modules
type: boolean
version:
pattern: '[a-zA-Z0-9\.-]+'
type: string
required:
- image
- repository
- useOcpDriverToolkit
- version
type: object
rdmaSharedDevicePlugin:
Expand Down
8 changes: 8 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,14 @@ rules:
- create
- patch
- update
- apiGroups:
- image.openshift.io
resources:
- imagestreams
verbs:
- get
- list
- watch
- apiGroups:
- k8s.cni.cncf.io
resources:
Expand Down
1 change: 1 addition & 0 deletions controllers/nicclusterpolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ type NicClusterPolicyReconciler struct {
// +kubebuilder:rbac:groups=nv-ipam.nvidia.com,resources=ippools/status,verbs=get;update;patch;
// +kubebuilder:rbac:groups=cert-manager.io,resources=issuers;certificates,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=admissionregistration.k8s.io,resources=validatingwebhookconfigurations,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=image.openshift.io,resources=imagestreams,verbs=get;list;watch

// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
Expand Down
1 change: 1 addition & 0 deletions deployment/network-operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,7 @@ imagePullSecrets:
| `ofedDriver.upgradePolicy.drain.deleteEmptyDir` | bool | `true` | continue even if there are pods using emptyDir |
| `ofedDriver.upgradePolicy.waitForCompletion.podSelector` | string | not set | specifies a label selector for the pods to wait for completion before starting the driver upgrade |
| `ofedDriver.upgradePolicy.waitForCompletion.timeoutSeconds` | int | not set | specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite |
| `ofedDriver.useOcpDriverToolkit` | bool | `true` | In OpenShift, use Driver Toolkit image to compile OFED drivers |

#### RDMA Device Plugin

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -576,12 +576,18 @@ spec:
type: integer
type: object
type: object
useOcpDriverToolkit:
default: true
description: UseOCPDriverToolkit indicates if DriverToolkit image
should be used on OpenShift to build and install driver modules
type: boolean
version:
pattern: '[a-zA-Z0-9\.-]+'
type: string
required:
- image
- repository
- useOcpDriverToolkit
- version
type: object
rdmaSharedDevicePlugin:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ spec:
image: {{ .Values.ofedDriver.image }}
repository: {{ .Values.ofedDriver.repository }}
version: {{ .Values.ofedDriver.version }}
useOcpDriverToolkit: {{ .Values.ofedDriver.useOcpDriverToolkit }}
{{- if .Values.ofedDriver.env }}
env:
{{ toYaml .Values.ofedDriver.env | nindent 6 }}
Expand Down
8 changes: 8 additions & 0 deletions deployment/network-operator/templates/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,14 @@ rules:
- create
- patch
- update
- apiGroups:
- image.openshift.io
resources:
- imagestreams
verbs:
- get
- list
- watch
- apiGroups:
- k8s.cni.cncf.io
resources:
Expand Down
1 change: 1 addition & 0 deletions deployment/network-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ ofedDriver:
# podSelector: "app=myapp"
# specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite
# timeoutSeconds: 300
useOcpDriverToolkit: true

rdmaSharedDevicePlugin:
deploy: true
Expand Down
1 change: 1 addition & 0 deletions hack/templates/values/values.template
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ ofedDriver:
# podSelector: "app=myapp"
# specify the length of time in seconds to wait before giving up for workload to finish, zero means infinite
# timeoutSeconds: 300
useOcpDriverToolkit: true

rdmaSharedDevicePlugin:
deploy: true
Expand Down
2 changes: 2 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/NVIDIA/k8s-operator-libs/pkg/upgrade"
netattdefv1 "github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/apis/k8s.cni.cncf.io/v1"
osconfigv1 "github.com/openshift/api/config/v1"
imagev1 "github.com/openshift/api/image/v1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
Expand Down Expand Up @@ -57,6 +58,7 @@ func init() {
utilruntime.Must(mellanoxcomv1alpha1.AddToScheme(scheme))
utilruntime.Must(netattdefv1.AddToScheme(scheme))
utilruntime.Must(osconfigv1.AddToScheme(scheme))
utilruntime.Must(imagev1.AddToScheme(scheme))
// +kubebuilder:scaffold:scheme
}

Expand Down
56 changes: 56 additions & 0 deletions manifests/state-ofed-driver/0050_ofed-driver-ds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ spec:
- image: {{ .RuntimeSpec.MOFEDImageName }}
imagePullPolicy: IfNotPresent
name: mofed-container
{{- if .RuntimeSpec.UseDtk }}
command: ["ocp_dtk_entrypoint"]
args: ["nv-fs-ctr-run-with-dtk"]
{{- end }}
securityContext:
privileged: true
seLinuxOptions:
Expand Down Expand Up @@ -112,6 +116,10 @@ spec:
readOnly: {{ .ReadOnly }}
{{- end }}
{{- end }}
{{- if .RuntimeSpec.UseDtk }}
- name: shared-nvidia-driver-toolkit
mountPath: /mnt/shared-nvidia-driver-toolkit
{{- end}}
startupProbe:
exec:
command:
Expand All @@ -135,6 +143,47 @@ spec:
initialDelaySeconds: {{ .CrSpec.ReadinessProbe.InitialDelaySeconds }}
failureThreshold: 1
periodSeconds: {{ .CrSpec.ReadinessProbe.PeriodSeconds }}
{{- if .RuntimeSpec.UseDtk }}
- image: {{ .RuntimeSpec.DtkImageName }}
imagePullPolicy: IfNotPresent
name: openshift-driver-toolkit-ctr
command: [bash, -xc]
args: ["until [ -f /mnt/shared-nvidia-driver-toolkit/dir_prepared ]; do echo Waiting for nvidia-driver-ctr container to prepare the shared directory ...; sleep 10; done; exec /mnt/shared-nvidia-driver-toolkit/ocp_dtk_entrypoint dtk-build-driver"]
securityContext:
privileged: true
seLinuxOptions:
level: "s0"
env:
{{- if .CrSpec.Env }}
{{- range .CrSpec.Env }}
{{ . | yaml | nindentPrefix 14 "- " }}
{{- end }}
{{- end }}
volumeMounts:
- name: run-mlnx-ofed
mountPath: /run/mellanox/drivers
mountPropagation: Bidirectional
- name: etc-network
mountPath: /etc/network
- name: host-etc
mountPath: /host/etc
- name: host-usr
mountPath: /host/usr
- name: host-udev
mountPath: /host/lib/udev
- name: host-lib-modules
mountPath: /host/lib/modules
{{- if.AdditionalVolumeMounts.VolumeMounts }}
{{- range .AdditionalVolumeMounts.VolumeMounts }}
- name: {{ .Name }}
mountPath: {{ .MountPath }}
subPath: {{ .SubPath }}
readOnly: {{ .ReadOnly }}
{{- end }}
{{- end }}
- name: shared-nvidia-driver-toolkit
mountPath: /mnt/shared-nvidia-driver-toolkit
{{- end }}
# unloading OFED modules can take more time than default terminationGracePeriod (30 sec)
terminationGracePeriodSeconds: {{ .CrSpec.TerminationGracePeriodSeconds }}
volumes:
Expand Down Expand Up @@ -165,10 +214,17 @@ spec:
{{ . | yaml | nindentPrefix 14 "- " }}
{{- end }}
{{- end }}
{{- if .RuntimeSpec.UseDtk }}
- name: shared-nvidia-driver-toolkit
emptyDir: {}
{{- end }}
nodeSelector:
feature.node.kubernetes.io/pci-15b3.present: "true"
feature.node.kubernetes.io/system-os_release.ID: {{ .RuntimeSpec.OSName }}
feature.node.kubernetes.io/system-os_release.VERSION_ID: "{{ .RuntimeSpec.OSVer }}"
{{- if .RuntimeSpec.UseDtk }}
feature.node.kubernetes.io/system-os_release.OSTREE_VERSION: "{{ .RuntimeSpec.RhcosVersion }}"
{{- end }}
{{- if .NodeAffinity }}
affinity:
nodeAffinity:
Expand Down
4 changes: 4 additions & 0 deletions pkg/nodeinfo/attributes.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ const (
NodeLabelNvGPU = "nvidia.com/gpu.present"
NodeLabelWaitOFED = "network.nvidia.com/operator.mofed.wait"
NodeLabelCudaVersionMajor = "nvidia.com/cuda.driver.major"
NodeLabelOSTreeVersion = "feature.node.kubernetes.io/system-os_release.OSTREE_VERSION"
)

type AttributeType int
Expand All @@ -50,6 +51,7 @@ const (
AttrTypeOSVer
// optional attrs
AttrTypeCudaVersionMajor
AttrTypeOSTreeVersion

OptionalAttrsStart = AttrTypeCudaVersionMajor
)
Expand All @@ -65,6 +67,8 @@ var attrToLabel = []string{
NodeLabelOSVer,
// AttrTypeCudaVersionMajor
NodeLabelCudaVersionMajor,
// AttrTypeOSTreeVersion
NodeLabelOSTreeVersion,
}

// NodeAttributes provides attributes of a specific node
Expand Down
50 changes: 48 additions & 2 deletions pkg/state/state_ofed.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/NVIDIA/k8s-operator-libs/pkg/upgrade"
"github.com/go-logr/logr"
osconfigv1 "github.com/openshift/api/config/v1"
apiimagev1 "github.com/openshift/api/image/v1"
"github.com/pkg/errors"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -153,7 +154,10 @@ type ofedRuntimeSpec struct {
MOFEDImageName string
InitContainerConfig initContainerConfig
// is true if cluster type is Openshift
IsOpenshift bool
IsOpenshift bool
UseDtk bool
DtkImageName string
RhcosVersion string
}

type ofedManifestRenderData struct {
Expand Down Expand Up @@ -397,6 +401,21 @@ func (s *stateOFED) getManifestObjects(
}
nodeAttr := attrs[0].Attributes

useDtk := clusterInfo.IsOpenshift() && cr.Spec.OFEDDriver.UseOCPDriverToolkit
var dtkImageName string
var rhcosVersion string
if useDtk {
if err := s.checkAttributesExist(attrs[0], nodeinfo.AttrTypeOSTreeVersion); err != nil {
return nil, err
}
rhcosVersion = nodeAttr[nodeinfo.AttrTypeOSTreeVersion]
dtk, err := s.ocpDriverToolkitImage(ctx, nodeAttr[nodeinfo.AttrTypeOSTreeVersion])
if err != nil {
return nil, fmt.Errorf("failed to get OpenShift DTK image : %v", err)
}
dtkImageName = dtk
}

setProbesDefaults(cr)

// Update MOFED Env variables with defaults for the cluster
Expand Down Expand Up @@ -426,7 +445,10 @@ func (s *stateOFED) getManifestObjects(
MOFEDImageName: s.getMofedDriverImageName(cr, nodeAttr, reqLogger),
InitContainerConfig: s.getInitContainerConfig(cr, reqLogger,
config.FromEnv().State.OFEDState.InitContainerImage),
IsOpenshift: clusterInfo.IsOpenshift(),
IsOpenshift: clusterInfo.IsOpenshift(),
UseDtk: useDtk,
DtkImageName: dtkImageName,
RhcosVersion: rhcosVersion,
},
Tolerations: cr.Spec.Tolerations,
NodeAffinity: cr.Spec.NodeAffinity,
Expand Down Expand Up @@ -721,3 +743,27 @@ func (s *stateOFED) handleRepoConfig(
}
return nil
}

// ocpDriverToolkitImage gets the DTK ImageStream and return the DTK image according to RHCOS version
func (s *stateOFED) ocpDriverToolkitImage(ctx context.Context, rhcosVersion string) (string, error) {
reqLogger := log.FromContext(ctx)
dtkImageStream := &apiimagev1.ImageStream{}
name := "driver-toolkit"
namespace := "openshift"
err := s.client.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, dtkImageStream)
if err != nil {
reqLogger.Error(err, "Couldn't get the driver-toolkit imagestream")
return "", err
}
rhcosDriverToolkitImages := make(map[string]string)
reqLogger.Info("ocpDriverToolkitImages: driver-toolkit imagestream found")
for _, tag := range dtkImageStream.Spec.Tags {
rhcosDriverToolkitImages[tag.Name] = tag.From.Name
}

image, ok := rhcosDriverToolkitImages[rhcosVersion]
if !ok {
return "", fmt.Errorf("failed to find DTK image for RHCOS version: %v", rhcosVersion)
}
return image, nil
}
Loading

0 comments on commit 2c6b188

Please sign in to comment.