Skip to content

Commit 36651c4

Browse files
authored
Backport GPU provisioning updates to releases/2.10.0 (red-hat-data-services#1578)
Generalize GPU provisioning script to ease the addition of new providers (red-hat-data-services#1568)

* Refactor GPU operators directories
* Generalize GPU script away from the AWS provider
* Minor changes in kustomize YAML
* Fix AWS GPU overlay
* Fix file path
* Add missing end lines
1 parent 186d7f9 commit 36651c4

File tree

13 files changed

+44
-20
lines changed

13 files changed

+44
-20
lines changed

ods_ci/tasks/Resources/Provisioning/GPU/gpu_deploy.sh renamed to ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ CSVNAME="$(oc get packagemanifests/gpu-operator-certified -n openshift-marketpla
1212
sed -i'' -e "0,/v1.11/s//$CHANNEL/g" -e "s/gpu-operator-certified.v1.11.0/$CSVNAME/g" "$GPU_INSTALL_DIR/gpu_install.yaml"
1313

1414
oc apply -f "$GPU_INSTALL_DIR/gpu_install.yaml"
15-
oc apply -f "$GPU_INSTALL_DIR/nfd_operator.yaml"
15+
oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
1616
echo "Wait for Nvidia GPU Operator Subscription, InstallPlan and Deployment to complete"
1717

1818
oc wait --timeout=3m --for jsonpath='{.status.state}'=AtLatestKnown -n openshift-nfd sub nfd
@@ -80,7 +80,7 @@ function rerun_accelerator_migration() {
8080
}
8181

8282
wait_until_pod_ready_status "gpu-operator"
83-
oc apply -f "$GPU_INSTALL_DIR/nfd_deploy.yaml"
83+
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
8484
oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
8585
oc apply -f clusterpolicy.json
8686
wait_until_pod_ready_status "nvidia-device-plugin-daemonset"

ods_ci/tasks/Resources/Provisioning/Hive/AWS/provision-gpu.sh renamed to ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh

Lines changed: 11 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@ set -e
33

44
# Optional params
55
INSTANCE_TYPE=${1:-"g4dn.xlarge"}
6+
PROVIDER=${2:-"AWS"}
7+
KUSTOMIZE_PATH="$PWD/tasks/Resources/Provisioning/Hive/GPU"
8+
MACHINESET_PATH="$KUSTOMIZE_PATH/base/source-machineset.yaml"
69

710
# Check if existing machineset GPU already exists
811
EXISTING_GPU_MACHINESET="$(oc get machineset -n openshift-machine-api -o jsonpath="{.items[?(@.metadata.annotations['machine\.openshift\.io/GPU']>'0')].metadata.name}")"
@@ -14,28 +17,18 @@ fi
1417

1518
# Select the first machineset as a template for the GPU machineset
1619
SOURCE_MACHINESET=$(oc get machineset -n openshift-machine-api -o name | head -n1)
20+
oc get -o yaml -n openshift-machine-api $SOURCE_MACHINESET > $MACHINESET_PATH
1721

18-
# Reformat with jq, for better diff result.
19-
oc get -o json -n openshift-machine-api $SOURCE_MACHINESET | jq -r > /tmp/source-machineset.json
20-
21-
OLD_MACHINESET_NAME=$(jq '.metadata.name' -r /tmp/source-machineset.json )
22+
# rename machine set in the template file
23+
OLD_MACHINESET_NAME=$(yq '.metadata.name' $MACHINESET_PATH )
2224
NEW_MACHINESET_NAME=${OLD_MACHINESET_NAME/worker/gpu}
25+
sed -i'' -e "s/$OLD_MACHINESET_NAME/$NEW_MACHINESET_NAME/g" $MACHINESET_PATH
2326

27+
# set the desired node flavor in the kustomize overlay
28+
sed -i'' -e "s/INSTANCE_TYPE/$INSTANCE_TYPE/g" $KUSTOMIZE_PATH/overlays/$PROVIDER/gpu.yaml
2429

25-
# Change instanceType and delete some stuff
26-
jq -r --arg INSTANCE_TYPE "$INSTANCE_TYPE" '.spec.template.spec.providerSpec.value.instanceType=$INSTANCE_TYPE
27-
| del(.metadata.selfLink)
28-
| del(.metadata.uid)
29-
| del(.metadata.creationTimestamp)
30-
| del(.metadata.resourceVersion)
31-
' /tmp/source-machineset.json > /tmp/gpu-machineset.json
32-
33-
# Change machineset name
34-
sed -i'' -e "s/$OLD_MACHINESET_NAME/$NEW_MACHINESET_NAME/g" /tmp/gpu-machineset.json
35-
# Create new machineset
36-
oc apply -f /tmp/gpu-machineset.json
37-
rm /tmp/source-machineset.json
38-
rm /tmp/gpu-machineset.json
30+
# create the new MachineSet using kustomize
31+
oc apply --kustomize $KUSTOMIZE_PATH/overlays/$PROVIDER
3932

4033
# Add GPU label to the new machine-set
4134
oc patch machinesets -n openshift-machine-api "$NEW_MACHINESET_NAME" -p '{"metadata":{"labels":{"gpu-machineset":"true"}}}' --type=merge
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
2+
kind: Kustomization
3+
resources:
4+
- source-machineset.yaml
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# PLACEHOLDER - the content is dynamically generated by ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
- op: replace
2+
path: /spec/template/spec/providerSpec/value/instanceType
3+
value: INSTANCE_TYPE
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
2+
kind: Kustomization
3+
metadata:
4+
name: add-gpu
5+
resources:
6+
- ../../base/
7+
patches:
8+
- path: gpu.yaml
9+
target:
10+
kind: MachineSet
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
- op: replace
2+
path: /spec/template/spec/providerSpec/value/profile
3+
value: INSTANCE_TYPE
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
apiVersion: kustomize.config.k8s.io/v1beta1
2+
kind: Kustomization
3+
metadata:
4+
name: add-gpu
5+
resources:
6+
- ../../base/
7+
patches:
8+
- path: gpu.yaml
9+
target:
10+
kind: MachineSet

0 commit comments

Comments
 (0)