# Prepare the cluster operators

1. Install the NFD Operator
   - don't forget to create the NodeFeatureDiscovery custom resource (a sketch of creating both CRs follows this list)
2. Install the NVIDIA GPU Operator
   - don't forget to create the ClusterPolicy custom resource (see the sketch below)
3. Install the Local Storage Operator
   - see the next section for the creation of a LocalVolume
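
One possible way to create these two custom resources is to apply the default examples shipped with each operator. This is a minimal sketch, assuming the operators are installed in their usual namespaces (`openshift-nfd` and `nvidia-gpu-operator`), that `jq` is available, and that the first `alm-examples` entry of each CSV is the CR you want:

```
# NFD Operator: create the default NodeFeatureDiscovery from the CSV example
NFD_CSV=$(oc get csv -n openshift-nfd -oname | grep nfd)
oc get $NFD_CSV -n openshift-nfd -ojsonpath='{.metadata.annotations.alm-examples}' \
  | jq '.[0]' | oc apply -n openshift-nfd -f-

# GPU Operator: create the default ClusterPolicy the same way
GPU_CSV=$(oc get csv -n nvidia-gpu-operator -oname | grep gpu-operator)
oc get $GPU_CSV -n nvidia-gpu-operator -ojsonpath='{.metadata.annotations.alm-examples}' \
  | jq '.[0]' | oc apply -n nvidia-gpu-operator -f-
```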

# Set up the storage

1. Create the local disk storage class

```
# adjust the device, storage-class name and node name to match the system
DISK_DEV=/dev/nvme2n1
STORAGE_CLASS_NAME=local-sc-dgx
NODE_NAME=dgxa100

cat <<EOF | oc apply -f-
apiVersion: local.storage.openshift.io/v1
kind: LocalVolume
metadata:
  name: local-disks
  namespace: openshift-local-storage
spec:
  nodeSelector:
    nodeSelectorTerms:
    - matchExpressions:
      - key: kubernetes.io/hostname
        operator: In
        values:
        - $NODE_NAME
  storageClassDevices:
  - storageClassName: $STORAGE_CLASS_NAME
    volumeMode: Filesystem
    fsType: xfs
    devicePaths:
    - $DISK_DEV
EOF
```
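
To confirm the LocalVolume was picked up (a hedged sanity check; the PersistentVolume name is generated by the Local Storage Operator), list the LocalVolume, the storage class and the resulting PV:

```
oc get localvolume -n openshift-local-storage
oc get sc $STORAGE_CLASS_NAME
oc get pv | grep $STORAGE_CLASS_NAME
```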

# Enable user-workload monitoring

```
cat <<EOF | oc apply -f-
apiVersion: v1
kind: ConfigMap
metadata:
  name: cluster-monitoring-config
  namespace: openshift-monitoring
data:
  config.yaml: |
    enableUserWorkload: true
EOF
```
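
Once this ConfigMap is applied, the user-workload monitoring stack comes up on its own; a simple sanity check is to list its pods:

```
oc get pods -n openshift-user-workload-monitoring
```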

# Prepare the dataset and image for the benchmark

1. Create the dataset PVC

```
PVC_NAME=benchmarking-coco-dataset
NAMESPACE=default
STORAGE_CLASS_NAME=local-sc-dgx

cat <<EOF | oc apply -f-
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: $PVC_NAME
  namespace: $NAMESPACE
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 80Gi
  storageClassName: $STORAGE_CLASS_NAME
EOF
```
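
Note that the PVC may stay `Pending` until a pod consumes it, if the storage class uses `WaitForFirstConsumer` binding (the Local Storage Operator default); its status can be checked with:

```
oc get pvc $PVC_NAME -n $NAMESPACE
```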

2. Populate the PVC with the coco-dataset

```
NODE_NAME=dgxa100
NAMESPACE=default
PVC_NAME=benchmarking-coco-dataset

./run_toolbox.py benchmarking download_coco_dataset $NODE_NAME \
    --namespace $NAMESPACE \
    --pvc-name $PVC_NAME
```
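
The download runs inside the cluster; one way to follow its progress (assuming the toolbox creates its pods in the chosen namespace) is to watch the pods there:

```
oc get pods -n $NAMESPACE -w
```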

3. Build the MLPerf NVIDIA PyTorch SSD image

```
# enable the internal image registry, with ephemeral (emptyDir) storage
oc patch configs.imageregistry.operator.openshift.io cluster --type merge --patch '{"spec":{"managementState":"Managed"}}'
oc patch configs.imageregistry.operator.openshift.io cluster --type merge --patch '{"spec":{"storage":{"emptyDir":{}}}}'
```
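
Note that `emptyDir` storage is ephemeral: images stored in the internal registry are lost if the registry pod restarts, and the build then has to be re-run. The registry rollout can be checked with:

```
oc get pods -n openshift-image-registry
```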

```
NAMESPACE=default

cat <<EOF | oc apply -f-
kind: ImageStream
apiVersion: image.openshift.io/v1
metadata:
  name: mlperf
  namespace: $NAMESPACE
  labels:
    app: mlperf
spec: {}
EOF
```
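
The empty ImageStream only provides a target for the build output; it can be checked with:

```
oc get imagestream mlperf -n $NAMESPACE
```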

```
NAMESPACE=default

cat <<EOF | oc apply -f-
apiVersion: build.openshift.io/v1
kind: BuildConfig
metadata:
  labels:
    app: mlperf
  name: mlperf0.7
  namespace: $NAMESPACE
spec:
  output:
    to:
      kind: ImageStreamTag
      name: mlperf:ssd_0.7
      namespace: $NAMESPACE
  resources: {}
  source:
    type: Git
    git:
      uri: "https://github.com/kpouget/training_results_v0.7.git"
      ref: "master"
    contextDir: NVIDIA/benchmarks/ssd/implementations/pytorch
  triggers:
  - type: "ConfigChange"
  strategy:
    type: Docker
    dockerStrategy:
      dockerfilePath: Dockerfile
      from:
        kind: DockerImage
        name: nvcr.io/nvidia/pytorch:20.06-py3
EOF
```
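
The `ConfigChange` trigger starts a build as soon as the BuildConfig is created; the build can be followed, and re-run if needed, with:

```
oc logs -f bc/mlperf0.7 -n $NAMESPACE
oc start-build mlperf0.7 -n $NAMESPACE   # only if the build needs to be re-run
```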

# Allow extra MIG modes

```
oc apply -f mig-custom.yaml
oc patch clusterpolicy/gpu-cluster-policy --type='json' -p='[{"op": "replace", "path": "/spec/migManager/config/name", "value": "custom-mig-config"}]'
```
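
The content of `mig-custom.yaml` is not shown in this document. As a purely illustrative sketch, a custom MIG configuration ConfigMap typically looks like the following (the namespace and MIG geometry below are assumptions; only the name has to match the `custom-mig-config` referenced above):

```
apiVersion: v1
kind: ConfigMap
metadata:
  name: custom-mig-config           # must match /spec/migManager/config/name
  namespace: nvidia-gpu-operator    # assumption: GPU Operator namespace
data:
  config.yaml: |
    version: v1
    mig-configs:
      custom-balanced:
        - devices: all
          mig-enabled: true
          mig-devices:
            "1g.5gb": 2
            "2g.10gb": 1
            "3g.20gb": 1
```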

# Enable MIG metrics

```
oc patch clusterpolicy/gpu-cluster-policy --type='json' -p='[{"op": "replace", "path": "/spec/dcgmExporter/repository", "value": "nvcr.io/nvidia/k8s"}]'
oc patch clusterpolicy/gpu-cluster-policy --type='json' -p='[{"op": "replace", "path": "/spec/dcgmExporter/image", "value": "dcgm-exporter"}]'
oc patch clusterpolicy/gpu-cluster-policy --type='json' -p='[{"op": "replace", "path": "/spec/dcgmExporter/version", "value": "2.3.1-2.6.1-ubi8"}]'
```
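
After these patches, the GPU Operator rolls out new `dcgm-exporter` pods with the updated image; a quick check (assuming the default `nvidia-gpu-operator` namespace and pod labels) is:

```
oc get pods -n nvidia-gpu-operator -l app=nvidia-dcgm-exporter
```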