Skip to content

Commit

Permalink
Backport "Update AMD Operator and NFD install scripts (#2139)" in rel…
Browse files Browse the repository at this point in the history
…eases/2.10.0 (#2171)

* Update AMD Operator and NFD install scripts (#2139)

* workaround for amd certified operator in ocp < 4.16

* add NFD installation script and use it in AMD script

* use NFD install script in NVIDIA script

* minor change

* update warn msg

* rm unused function

* fix conflict
  • Loading branch information
bdattoma authored Jan 13, 2025
1 parent c07970f commit e2c506e
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 9 deletions.
31 changes: 27 additions & 4 deletions ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,32 @@ EOF
fi
}

#######################################
# Workaround for OCP versions older than 4.16.
# The AMD certified operator is published starting from OCP v4.16, so on
# older clusters this creates a custom CatalogSource backed by the v4.16
# certified-operator index and rewrites amd_gpu_install.yaml to use it.
# Globals:
#   GPU_INSTALL_DIR (read) - directory containing amd_gpu_install.yaml
# Arguments:
#   None
# Outputs:
#   Progress messages on stdout; a warning on stderr when the cluster
#   version cannot be parsed.
# Returns:
#   0 when the workaround is not needed, was skipped, or was applied;
#   otherwise the status of the last failing command.
#######################################
function applyWorkaroundForOlderOCPVersions () {
  local ocpVersion ocpMajor ocpMinor
  local -a ocpVersionSplit=()
  local -r baseCatalog="certified-operators"
  local -r catalogSuffix="-416-amd"
  local -r customCatalog="${baseCatalog}${catalogSuffix}"

  # jq -r prints the raw string, so no quote stripping is needed afterwards.
  ocpVersion=$(oc version --output json | jq -r '.openshiftVersion')
  IFS='.' read -ra ocpVersionSplit <<< "$ocpVersion"
  ocpMajor="${ocpVersionSplit[0]:-}"
  ocpMinor="${ocpVersionSplit[1]:-}"

  # Without a numeric major.minor we cannot decide; skip instead of letting
  # the comparison below fail with an obscure "integer expression" error.
  if ! [[ "$ocpMajor" =~ ^[0-9]+$ && "$ocpMinor" =~ ^[0-9]+$ ]]; then
    echo "WARNING: unable to parse OCP version '$ocpVersion', skipping AMD catalog source workaround" >&2
    return 0
  fi

  # Compare major and minor so that e.g. 5.0 is not mistaken for "< 4.16".
  # The 10# prefix forces base 10 even if a component has a leading zero.
  if (( 10#$ocpMajor > 4 || (10#$ocpMajor == 4 && 10#$ocpMinor >= 16) )); then
    return 0
  fi

  echo "OCP Version: $ocpVersion"
  echo "AMD Operator is not available for versions < 4.16, hence creating custom catalog source as workaround"
  oc apply -f - <<EOF
apiVersion: operators.coreos.com/v1alpha1
kind: CatalogSource
metadata:
  name: ${customCatalog}
  namespace: openshift-marketplace
spec:
  displayName: Certfied operator
  image: 'registry.redhat.io/redhat/certified-operator-index:v4.16'
  publisher: RHOAI QE
  sourceType: grpc
EOF
  # NOTE(review): oc wait may fail if the catalog pod has not been created
  # yet when this runs - confirm whether a short retry is needed here.
  oc wait --timeout="120s" --for=condition=ready=true pod -n openshift-marketplace -l "olm.catalogSource=${customCatalog}"

  # The optional "\(-416-amd\)*" group also matches an already rewritten
  # name, so running this function twice does not append the suffix twice.
  sed -i'' -e "s/${baseCatalog}\(${catalogSuffix}\)*/${customCatalog}/g" "$GPU_INSTALL_DIR/amd_gpu_install.yaml"
}

# On OCP < 4.16 this creates a custom catalog source and rewrites
# amd_gpu_install.yaml to reference it; on newer clusters it changes nothing.
applyWorkaroundForOlderOCPVersions
check_registry
# Capture check_registry's exit status right away, before another command overwrites $?.
status=$?

Expand All @@ -170,10 +196,7 @@ fi
# Fixed 120 second pause before polling.
sleep 120
# NOTE(review): wait_while and machineconfig_updates are defined outside this view;
# presumably this polls for up to 1800s until the machine config update finishes - confirm.
wait_while 1800 ! machineconfig_updates

echo "Installing NFD operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
wait_while 360 ! has_csv_succeeded openshift-nfd nfd
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
# NOTE(review): this path is relative to the current working directory, not to this
# script, so it only resolves when the caller runs from the directory that contains
# tasks/ - confirm, or consider building the path from $GPU_INSTALL_DIR instead.
/bin/bash tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh
echo "Installing KMM operator"
oc apply -f "$GPU_INSTALL_DIR/kmm_operator_install.yaml"
wait_while 360 ! has_csv_succeeded openshift-kmm kernel-module-management
Expand Down
29 changes: 29 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash
#
# Installs the Node Feature Discovery (NFD) operator and creates its instance.
#
# Steps:
#   1. Apply nfd_operator.yaml and wait for the "nfd" subscription in the
#      openshift-nfd namespace to reach the AtLatestKnown state.
#   2. Pick the NFD operand image matching the cluster's OCP x.y version,
#      falling back to a default version when it is not in the table below.
#   3. Render nfd_deploy.yaml with that image and apply it.
#
# Requires: oc (logged in to the target cluster), jq, sed.
#
# pipefail makes a failing "oc version" or "sed" abort the script instead of
# being hidden by the last command of the pipeline.
set -euo pipefail

NFD_INSTALL_DIR="$(dirname "${BASH_SOURCE[0]}")"
NFD_INSTANCE="$NFD_INSTALL_DIR/nfd_deploy.yaml"
# Version whose image is re-used when the cluster's x.y version is unknown.
DEFAULT_IMAGE_VERSION="4.17"

echo "Installing NFD operator"
oc apply -f "$NFD_INSTALL_DIR/nfd_operator.yaml"
oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd

# jq -r prints the raw string, so no quote stripping is needed afterwards.
ocpVersion=$(oc version --output json | jq -r '.openshiftVersion')
IFS='.' read -ra ocpVersionSplit <<< "$ocpVersion"
xyVersion="${ocpVersionSplit[0]:-}.${ocpVersionSplit[1]:-}"

# Image URLs are stored unescaped; the sed call below uses "|" as its
# delimiter so the slashes need no escaping and log output stays readable.
declare -A images=(
  ["4.14"]="registry.redhat.io/openshift4/ose-node-feature-discovery@sha256:2977e67a413882efbfb90b52facf65d38a5cb2cd7a232ca3a69476e5dec33319"
  ["4.15"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:661b6697dee34626a3a98b50cdba787402ab214d2807b8460df92e3c79cdfcc5"
  ["4.16"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:bb95bc317ab78e8af4ef34dd66f9f62c2f8c261dfb5eab40918142812802f8b7"
  ["4.17"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:154cf3f1ddaf895d7ecd04947bd455a930132f72acc6e8bde8c26bc123184ace"
  # 4.18 is a pre-release image. We need to update it later
  ["4.18"]="registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9@sha256:510cb4351253492455664b6c323f54dc2f6f2f8791c5e92ba6b7e60b8adb357c"
)

if [[ -n "${images[$xyVersion]:-}" ]]; then
  imageUrl="${images[$xyVersion]}"
  echo "Using image SHA for $xyVersion: $imageUrl"
else
  imageUrl="${images[$DEFAULT_IMAGE_VERSION]}"
  echo "WARNING: I don't know the sha for $xyVersion. Re-using default $DEFAULT_IMAGE_VERSION $imageUrl. It might not work!" >&2
fi

# Render the manifest on the fly instead of editing it in place: the
# <imageUrl> placeholder stays in the checked-in file, so a later run (for
# example against a cluster with a different OCP version) still substitutes
# the right image instead of silently re-applying a stale one.
sed -e "s|<imageUrl>|${imageUrl}|g" "$NFD_INSTANCE" | oc apply -f -
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ spec:
instance: "" # instance is empty by default
topologyupdater: false # False by default
operand:
image: registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11
# Image URL example: registry.redhat.io/openshift4/ose-node-feature-discovery:v4.11
image: <imageUrl>
imagePullPolicy: Always
workerConfig:
configData: |
Expand Down
6 changes: 2 additions & 4 deletions ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,9 @@ CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketpla
# Patch the install manifest in place: the first expression replaces v1.11 with
# $CHANNEL only up to the first line that matches it (0,/re/ is a GNU sed
# address form); the second replaces every gpu-operator-certified.v1.11.0 with $CSVNAME.
sed -i'' -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" "$GPU_INSTALL_DIR/gpu_install.yaml"

oc apply -f "$GPU_INSTALL_DIR/gpu_install.yaml"
oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete"
# NOTE(review): this path is relative to the current working directory, not to this
# script - confirm the caller always runs from the directory that contains tasks/.
/bin/bash tasks/Resources/Provisioning/GPU/NFD/install_nfd.sh

# Block (up to 3 minutes) until the nfd subscription reports the AtLatestKnown state.
oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd
echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete"

# Block (up to 3 minutes) until the GPU operator subscription reports the AtLatestKnown state.
oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n nvidia-gpu-operator sub gpu-operator-certified

Expand Down Expand Up @@ -80,7 +79,6 @@ function rerun_accelerator_migration() {
}

# NOTE(review): wait_until_pod_ready_status is defined outside this view; presumably
# it blocks until pods matching the given name are ready - confirm.
wait_until_pod_ready_status "gpu-operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
# Take the first entry of the CSV's alm-examples annotation, write it to
# clusterpolicy.json in the current working directory, and apply it.
oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
oc apply -f clusterpolicy.json
wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
Expand Down

0 comments on commit e2c506e

Please sign in to comment.