forked from red-hat-data-services/ods-ci
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Backport GPU provisioning updates to releases/2.8.0 (red-hat-data-ser…
…vices#1581) Generalize GPU provisioning script to ease the addition of new providers (red-hat-data-services#1568) * refactor gpu operators directories * generalize GPU script away from AWS provider * minor changes in kustomize yaml * fix aws gpu overlay * fix filepath * add missing end lines
- Loading branch information
Showing
14 changed files
with
336 additions
and
46 deletions.
There are no files selected for viewing
28 changes: 28 additions & 0 deletions
28
ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_gpu_install.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
apiVersion: v1 | ||
kind: Namespace | ||
metadata: | ||
name: openshift-amd-gpu | ||
|
||
--- | ||
|
||
apiVersion: operators.coreos.com/v1 | ||
kind: OperatorGroup | ||
metadata: | ||
name: openshift-amd-gpu-operator-group | ||
namespace: openshift-amd-gpu | ||
spec: {} | ||
|
||
--- | ||
|
||
apiVersion: operators.coreos.com/v1alpha1 | ||
kind: Subscription | ||
metadata: | ||
name: amd-gpu-operator | ||
namespace: openshift-amd-gpu | ||
spec: | ||
channel: alpha | ||
installPlanApproval: Automatic | ||
name: amd-gpu-operator | ||
source: community-operators | ||
sourceNamespace: openshift-marketplace | ||
|
196 changes: 196 additions & 0 deletions
196
ods_ci/tasks/Resources/Provisioning/GPU/AMD/amd_operator.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,196 @@ | ||
#!/bin/bash | ||
set -e | ||
|
||
GPU_INSTALL_DIR="$(dirname "$0")" | ||
|
||
# Configure the cluster's internal image registry: switch its storage to
# emptyDir and set its management state to "Managed".
# Outputs: a confirmation line on stdout once both patches have been applied.
# Returns: 0 on success, otherwise the exit status of the failing `oc patch`.
function create_registry_network() {
  local registry_config="configs.imageregistry.operator.openshift.io"

  # Check each patch explicitly: `set -e` is suppressed when this function is
  # called from a conditional context (if / && / ||), and the success message
  # must never be printed after a failed patch.
  oc patch "$registry_config" cluster --type merge \
    --patch '{"spec":{"storage":{"emptyDir":{}}}}' || return
  oc patch "$registry_config" cluster --type merge \
    --patch '{"spec":{"managementState":"Managed"}}' || return
  echo "Internal registry network created."
}
|
||
# Check whether the internal image registry pod exists in the
# openshift-image-registry namespace (label docker-registry=default).
# When it is missing, create_registry_network is invoked to configure the
# registry, and the check is still reported as failed.
# Outputs: a status line on stdout.
# Returns: 0 if a registry pod was found, 1 otherwise.
function check_registry() {
  # Keep the pod name local so it does not leak into the caller's scope.
  local registry_pod
  registry_pod=$(oc get pod -l docker-registry=default -n openshift-image-registry \
    --no-headers -o custom-columns=":metadata.name")

  if [[ -z "$registry_pod" ]]; then
    echo "Internal registry pod is not present."
    create_registry_network
    return 1 # Failure
  fi

  echo "Internal registry pod ($registry_pod) is present."
  return 0 # Success
}
# Poll a condition every 2 seconds for as long as it keeps succeeding.
# Arguments:
#   $1    - timeout in seconds (non-negative integer)
#   $2... - the condition; the words are joined and passed to `eval`, which is
#           what lets callers negate it with a leading `!`
#           (e.g. `wait_while 360 ! has_csv_succeeded ns sub`).
# Outputs: one '.' per poll on stdout, plus a message when the timeout is hit.
# Returns: 0 once the condition stops succeeding, 1 when the accumulated wait
#          exceeds the timeout, 2 when the timeout argument is invalid.
function wait_while {
  local seconds timeout interval
  interval=2
  seconds=0
  timeout=${1:-}

  # A missing or non-numeric timeout (e.g. `wait_while ! cmd`) would make the
  # `-gt` test below error on every iteration, so the loop could never time out.
  if [[ ! "$timeout" =~ ^[0-9]+$ ]]; then
    echo "wait_while: timeout must be a non-negative integer, got '${timeout}'" >&2
    return 2
  fi
  shift

  while eval "$*"; do
    seconds=$((seconds + interval))
    sleep "$interval"
    echo -n '.'
    if [[ $seconds -gt $timeout ]]; then
      echo "Time out of ${timeout} exceeded"
      return 1
    fi
  done

  # Terminate the line of progress dots, if any were printed.
  if [[ "$seconds" != '0' ]]; then
    echo ''
  fi
  return 0
}
|
||
# Tell whether the ClusterServiceVersion currently referenced by a Subscription
# has reached the "Succeeded" phase.
# Arguments:
#   $1 - namespace of the subscription
#   $2 - subscription name
# Returns: 0 if the subscription reports a current CSV and that CSV's phase is
#          "Succeeded"; 1 otherwise (no CSV yet, "<none>", or any other phase).
has_csv_succeeded() {
  local ns=$1
  local subscription=$2
  # Both values stay local so repeated polling does not leak state to callers.
  local csv phase

  csv=$(oc get subscriptions.operators.coreos.com "${subscription}" -n "${ns}" \
    -o=custom-columns=CURRENT_CSV:.status.currentCSV --no-headers=true)
  # custom-columns prints "<none>" when the field is not populated yet.
  if [[ -z "$csv" || "$csv" == "<none>" ]]; then
    return 1
  fi

  phase=$(oc get clusterserviceversions.operators.coreos.com -n "${ns}" "${csv}" \
    -o=custom-columns=PHASE:.status.phase --no-headers=true)
  [[ "$phase" == "Succeeded" ]]
}
|
||
# Create the AMD DeviceConfig "dc-internal-registry" in the openshift-amd-gpu
# namespace unless a DeviceConfig with that name already exists.
# Outputs: a progress line on stdout, plus whatever `oc create` prints.
# Returns: 0 when the resource already exists, otherwise the status of `oc create`.
function create_devconfig() {
  local dc_name="dc-internal-registry"
  local dc
  dc=$(oc get DeviceConfig "$dc_name" -n openshift-amd-gpu -oname --ignore-not-found)

  if [[ -n "$dc" ]]; then
    echo "AMD DeviceConfig $dc_name already exists. Skipping creation"
    return 0
  fi

  echo "Creating AMD DeviceConfig..."
  # name/namespace must be nested under metadata for the manifest to be valid.
  oc create -f - <<EOF
kind: DeviceConfig
apiVersion: amd.io/v1alpha1
metadata:
  name: $dc_name
  namespace: openshift-amd-gpu
EOF
}
|
||
|
||
# Poll every 2 seconds until at least one pod matching a label selector exists.
# Arguments:
#   $1 - label selector passed to `oc get pods -l`
#   $2 - namespace to search
#   $3 - timeout in seconds
# Outputs: progress messages on stdout.
# Returns: 0 as soon as a matching pod is listed, 1 if the timeout elapses first.
function wait_until_pod_is_created() {
  # All state is local; the original leaked generic names such as `timeout`
  # and `namespace` into the global scope.
  local label=$1
  local namespace=$2
  local timeout=$3
  local start_time pod_name
  start_time=$(date +%s)

  while (($(date +%s) - start_time < timeout)); do
    pod_name=$(oc get pods -n "$namespace" -l "$label" -oname)
    if [[ -n "$pod_name" ]]; then
      echo "Pod $pod_name found!"
      return 0
    fi
    echo "waiting for pod with label $label"
    sleep 2
  done

  echo "Timeout exceeded, pod with label $label not found"
  return 1
}
|
||
# Succeeds only when every MachineConfigPool reports its "Updated" condition as
# "True": after collapsing repeated lines, the status column must be exactly one
# "True" (so at least one pool exists and none reports another value).
function machineconfig_updates {
  local updated_states
  updated_states=$(oc get machineconfigpool --no-headers=true \
    '-o=custom-columns=UPDATED:.status.conditions[?(@.type=="Updated")].status' | uniq)
  [[ "$updated_states" == "True" ]]
}
|
||
# Scan the current logs of one container and report every line that contains
# a given text.
# Arguments:
#   $1    - pod name
#   $2    - namespace
#   $3    - container name
#   $4... - text to look for; multiple words are joined with single spaces
# Outputs: a header line, then one "Found ..." line per matching log line.
# Returns: the status of the scanning loop (0 whether or not a match was found).
function monitor_logs() {
  local pod_name=$1
  local ns=$2
  local c_name=$3
  shift 3
  # Join the remaining words verbatim. `printf "%q "` would shell-escape them
  # and append a trailing space, so the substring match below could not match
  # ordinary log lines.
  local search_text="$*"
  local line

  echo "Monitoring logs for pod $pod_name..."
  # One-shot fetch of the logs available right now (no --follow).
  oc logs "$pod_name" -c "$c_name" -n "$ns" | while IFS= read -r line; do
    if [[ $line == *"$search_text"* ]]; then
      echo "Found \"$search_text\" in pod logs: $line"
    fi
  done
}
|
||
# Wait for the builder pod in openshift-amd-gpu (label openshift.io/build.name)
# to become ready and then to be deleted, and finally verify that the
# amd_gpu_kmm_modules ImageStream exists.
# Arguments:
#   $1 - seconds to wait for the builder pod to become ready
#   $2 - seconds to wait for the builder pod to be deleted
# Outputs: progress messages on stdout.
# Exits the script with status 1 if the ImageStream cannot be retrieved.
function wait_until_driver_image_is_built() {
  local startup_timeout=$1
  local build_timeout=$2
  local image

  # NOTE: `name` is deliberately left global; the commented-out monitor_logs
  # call at the end of the script refers to "$name".
  name=$(oc get pod -n openshift-amd-gpu -l openshift.io/build.name -oname)
  echo "Builder pod name: $name"
  oc wait --timeout="${startup_timeout}s" --for=condition=ready pod -n openshift-amd-gpu -l openshift.io/build.name
  echo "Wait for the image build to finish"
  oc wait --timeout="${build_timeout}s" --for=delete pod -n openshift-amd-gpu -l openshift.io/build.name
  echo "Checking the image stream got created"

  # Test the command directly: with `set -e`, a failing `image=$(...)` on its
  # own line aborts the script before a later `$?` check could print the
  # diagnostic below.
  if image=$(oc get is amd_gpu_kmm_modules -n openshift-amd-gpu -oname); then
    echo ".Image Stream $image found!"
  else
    echo ".Image Stream amd_gpu_kmm_modules not found. Check the cluster"
    exit 1
  fi
}
|
||
# Apply the "ods-ci-amd-gpu" AcceleratorProfile in redhat-ods-applications and,
# on success, describe the AcceleratorProfiles in that namespace.
# Outputs: progress messages on stdout; an error message on stderr on failure.
# Returns: 1 if `oc apply` fails, otherwise the status of `oc describe`.
function create_acceleratorprofile() {
  echo "Creating an Accelerator Profile for Dashboard"
  # Test `oc apply` directly instead of inspecting `$?` afterwards, and report
  # the failure explicitly so it is not swallowed when `set -e` is suppressed.
  if ! oc apply -f - <<'EOF'
apiVersion: dashboard.opendatahub.io/v1
kind: AcceleratorProfile
metadata:
  name: ods-ci-amd-gpu
  namespace: redhat-ods-applications
spec:
  displayName: AMD GPU
  enabled: true
  identifier: amd.com/gpu
  tolerations:
    - effect: NoSchedule
      key: amd.com/gpu
      operator: Exists
EOF
  then
    echo "Failed to apply the AcceleratorProfile ods-ci-amd-gpu" >&2
    return 1
  fi

  echo "Verifying that an AcceleratorProfiles resource was created in redhat-ods-applications"
  oc describe AcceleratorProfiles -n redhat-ods-applications
}
|
||
# ---------------------------------------------------------------------------
# Main flow: verify the internal registry, blacklist the inbox amdgpu driver,
# install the NFD, KMM and AMD GPU operators, wait for the driver image build
# and finally create the dashboard AcceleratorProfile.
# ---------------------------------------------------------------------------

# check_registry returns non-zero when the registry pod is missing (after
# configuring the registry). Test it directly: a bare call followed by
# `status=$?` aborts under `set -e` before the status is ever read, and
# `return` is only valid inside a function or a sourced file.
if ! check_registry; then
  echo "Internal registry check failed; aborting AMD GPU operator installation" >&2
  exit 1
fi

# Blacklist the inbox drivers with a MachineConfig now that the registry check was successful
oc apply -f "$GPU_INSTALL_DIR/blacklist_driver.yaml"

# Fixed pause before polling the MachineConfigPools
# (presumably so the rollout has started before the first check - TODO confirm).
sleep 120
wait_while 1800 ! machineconfig_updates

echo "Installing NFD operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_operator.yaml"
wait_while 360 ! has_csv_succeeded openshift-nfd nfd
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"

echo "Installing KMM operator"
oc apply -f "$GPU_INSTALL_DIR/kmm_operator_install.yaml"
wait_while 360 ! has_csv_succeeded openshift-kmm kernel-module-management

echo "Installing AMD operator"
oc apply -f "$GPU_INSTALL_DIR/amd_gpu_install.yaml"
wait_while 360 ! has_csv_succeeded openshift-amd-gpu amd-gpu-operator
create_devconfig

# Only wait for the builder pod when the driver ImageStream does not exist yet.
image=$(oc get is amd_gpu_kmm_modules -n openshift-amd-gpu -oname --ignore-not-found)
if [[ -n "$image" ]]; then
  echo ".Image Stream amd_gpu_kmm_modules already present! Skipping waiting for builder pod"
else
  wait_until_pod_is_created openshift.io/build.name openshift-amd-gpu 180
  wait_until_driver_image_is_built 60 1200
fi
echo "Configuration of AMD GPU node and Operators completed"

# The "Successfully pushed" message does appear in the builder logs, but the pod
# may be deleted before the next polling iteration reads the logs again, in which
# case the pod can no longer be reached (this was observed in practice). The
# log-based wait is therefore disabled:
# wait_while 1200 monitor_logs "$name" openshift-amd-gpu docker-build "Successfully pushed image-registry.openshift-image-registry.svc:5000/openshift-amd-gpu"
create_acceleratorprofile
17 changes: 17 additions & 0 deletions
17
ods_ci/tasks/Resources/Provisioning/GPU/AMD/blacklist_driver.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
apiVersion: machineconfiguration.openshift.io/v1 | ||
kind: MachineConfig | ||
metadata: | ||
labels: | ||
machineconfiguration.openshift.io/role: worker | ||
name: amdgpu-module-blacklist | ||
spec: | ||
config: | ||
ignition: | ||
version: 3.2.0 | ||
storage: | ||
files: | ||
- path: "/etc/modprobe.d/amdgpu-blacklist.conf" | ||
mode: 420 | ||
overwrite: true | ||
contents: | ||
source: "data:text/plain;base64,YmxhY2tsaXN0IGFtZGdwdQo=" |
26 changes: 26 additions & 0 deletions
26
ods_ci/tasks/Resources/Provisioning/GPU/AMD/kmm_operator_install.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
apiVersion: v1 | ||
kind: Namespace | ||
metadata: | ||
name: openshift-kmm | ||
|
||
--- | ||
|
||
apiVersion: operators.coreos.com/v1 | ||
kind: OperatorGroup | ||
metadata: | ||
name: openshift-kmm-operator-group | ||
namespace: openshift-kmm | ||
spec: {} | ||
|
||
--- | ||
apiVersion: operators.coreos.com/v1alpha1 | ||
kind: Subscription | ||
metadata: | ||
name: kernel-module-management | ||
namespace: openshift-kmm | ||
spec: | ||
channel: stable | ||
installPlanApproval: Automatic | ||
name: kernel-module-management | ||
source: redhat-operators | ||
sourceNamespace: openshift-marketplace |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
#!/bin/bash
# Create a GPU MachineSet by cloning the first MachineSet found in
# openshift-machine-api and replacing its instance type through a
# provider-specific kustomize overlay.
#
# Usage: <this script> [INSTANCE_TYPE] [PROVIDER]
#   INSTANCE_TYPE - node flavor for the GPU machines (default: g4dn.xlarge)
#   PROVIDER      - overlay directory name under overlays/ (default: AWS)
#
# Paths are resolved relative to the current working directory ($PWD).
# NOTE: the overlay's gpu.yaml and the base source-machineset.yaml are
# modified in place.
set -e

# Optional params
INSTANCE_TYPE=${1:-"g4dn.xlarge"}
PROVIDER=${2:-"AWS"}
KUSTOMIZE_PATH="$PWD/tasks/Resources/Provisioning/Hive/GPU"
MACHINESET_PATH="$KUSTOMIZE_PATH/base/source-machineset.yaml"
OVERLAY_PATH="$KUSTOMIZE_PATH/overlays/$PROVIDER"

# Fail early with a clear message when no overlay exists for the provider.
if [[ ! -f "$OVERLAY_PATH/gpu.yaml" ]]; then
  echo "No GPU kustomize overlay found for provider '$PROVIDER' at $OVERLAY_PATH" >&2
  exit 1
fi

# Check if existing machineset GPU already exists
EXISTING_GPU_MACHINESET="$(oc get machineset -n openshift-machine-api -o jsonpath="{.items[?(@.metadata.annotations['machine\.openshift\.io/GPU']>'0')].metadata.name}")"
if [[ -n "$EXISTING_GPU_MACHINESET" ]]; then
  echo "Machine-set for GPU already exists"
  oc get machinesets -A --show-labels
  exit 0
fi

# Select the first machineset as a template for the GPU machineset
SOURCE_MACHINESET=$(oc get machineset -n openshift-machine-api -o name | head -n1)
if [[ -z "$SOURCE_MACHINESET" ]]; then
  echo "No MachineSet found in openshift-machine-api to use as a template" >&2
  exit 1
fi
oc get -o yaml -n openshift-machine-api "$SOURCE_MACHINESET" > "$MACHINESET_PATH"

# rename machine set in the template file
OLD_MACHINESET_NAME=$(yq '.metadata.name' "$MACHINESET_PATH")
NEW_MACHINESET_NAME=${OLD_MACHINESET_NAME/worker/gpu}
# If the source name has no "worker" part the rename is a no-op, and applying
# the template would overwrite and relabel the source MachineSet itself.
if [[ "$NEW_MACHINESET_NAME" == "$OLD_MACHINESET_NAME" ]]; then
  echo "Cannot derive a GPU MachineSet name from '$OLD_MACHINESET_NAME' (no 'worker' in name)" >&2
  exit 1
fi
sed -i'' -e "s/$OLD_MACHINESET_NAME/$NEW_MACHINESET_NAME/g" "$MACHINESET_PATH"

# set the desired node flavor in the kustomize overlay
sed -i'' -e "s/INSTANCE_TYPE/$INSTANCE_TYPE/g" "$OVERLAY_PATH/gpu.yaml"

# create the new MachineSet using kustomize
oc apply --kustomize "$OVERLAY_PATH"

# Add GPU label to the new machine-set
oc patch machinesets -n openshift-machine-api "$NEW_MACHINESET_NAME" -p '{"metadata":{"labels":{"gpu-machineset":"true"}}}' --type=merge
42 changes: 0 additions & 42 deletions
42
ods_ci/tasks/Resources/Provisioning/Hive/AWS/provision-gpu.sh
This file was deleted.
Oops, something went wrong.
4 changes: 4 additions & 0 deletions
4
ods_ci/tasks/Resources/Provisioning/Hive/GPU/base/kustomization.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
apiVersion: kustomize.config.k8s.io/v1beta1 | ||
kind: Kustomization | ||
resources: | ||
- source-machineset.yaml |
1 change: 1 addition & 0 deletions
1
ods_ci/tasks/Resources/Provisioning/Hive/GPU/base/source-machineset.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# PLACEHOLDER - the content is dynamically generated by ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh |
3 changes: 3 additions & 0 deletions
3
ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/AWS/gpu.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
- op: replace | ||
path: /spec/template/spec/providerSpec/value/instanceType | ||
value: INSTANCE_TYPE |
10 changes: 10 additions & 0 deletions
10
ods_ci/tasks/Resources/Provisioning/Hive/GPU/overlays/AWS/kustomization.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
apiVersion: kustomize.config.k8s.io/v1beta1 | ||
kind: Kustomization | ||
metadata: | ||
name: add-gpu | ||
resources: | ||
- ../../base/ | ||
patches: | ||
- path: gpu.yaml | ||
target: | ||
kind: MachineSet |
Oops, something went wrong.