Skip to content

Commit

Permalink
DGX-A100_GPU-Operator_testing: cluster_setup.md: new file
Browse files Browse the repository at this point in the history
  • Loading branch information
kpouget committed Jun 14, 2022
1 parent 8580ce9 commit f04181b
Showing 1 changed file with 165 additions and 0 deletions.
165 changes: 165 additions & 0 deletions DGX-A100_GPU-Operator_testing/cluster_setup.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# Prepare the cluster operators

1. Install NFD Operator
- don't forget to create the NodeFeatureDiscoveries

2. Install the NVIDIA GPU Operator
- don't forgot to create the ClusterPolicy

2. Install the Local Storage Operator
- see next step for the creation of a LocalVolume

# Setup the storage

1. Create the local disk storage class

```
DISK_DEV=/dev/nvme2n1
STORAGE_CLASS_NAME=local-sc-dgx
NODE_NAME=dgxa100
cat <<EOF | oc apply -f-
apiVersion: local.storage.openshift.io/v1
kind: LocalVolume
metadata:
name: local-disks
namespace: openshift-local-storage
spec:
nodeSelector:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/hostname
operator: In
values:
- $NODE_NAME
storageClassDevices:
- storageClassName: $STORAGE_CLASS_NAME
volumeMode: Filesystem
fsType: xfs
devicePaths:
- $DISK_DEV
EOF
```

# Enable user-workload monitoring

```
cat <<EOF | oc apply -f-
apiVersion: v1
kind: ConfigMap
metadata:
name: cluster-monitoring-config
namespace: openshift-monitoring
data:
config.yaml: |
enableUserWorkload: true
EOF
```

# Prepare the dataset and image for the benchmark

1. Create the dataset PVC

```
PVC_NAME=benchmarking-coco-dataset
NAMESPACE=default
STORAGE_CLASS_NAME=local-sc-dgx
cat <<EOF | oc apply -f-
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: $PVC_NAME
namespace: $NAMESPACE
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 80Gi
storageClassName: $STORAGE_CLASS_NAME
EOF
```

2. Populate the PVC with the coco-dataset

```
NODE_NAME=dgxa100
NAMESPACE=default
PVC_NAME=benchmarking-coco-dataset
./run_toolbox.py benchmarking download_coco_dataset $NODE_NAME \
--namespace $NAMESPACE \
--pvc-name $PVC_NAME
```

3. Build the MLPerf NVIDIA PyToch SSD image

```
oc patch configs.imageregistry.operator.openshift.io cluster --type merge --patch '{"spec":{"managementState":"Managed"}}'
oc patch configs.imageregistry.operator.openshift.io cluster --type merge --patch '{"spec":{"storage":{"emptyDir":{}}}}'
```

```
NAMESPACE=default
cat <<EOF | oc apply -f-
kind: ImageStream
apiVersion: image.openshift.io/v1
metadata:
name: mlperf
namespace: $NAMESPACE
labels:
app: mlperf
spec: {}
EOF
```

```
NAMESPACE=default
cat <<EOF | oc apply -f-
apiVersion: build.openshift.io/v1
kind: BuildConfig
metadata:
labels:
app: mlperf
name: mlperf0.7
namespace: $NAMESPACE
spec:
output:
to:
kind: ImageStreamTag
name: mlperf:ssd_0.7
namespace: default
resources: {}
source:
type: Git
git:
uri: "https://github.com/kpouget/training_results_v0.7.git"
ref: "master"
contextDir: NVIDIA/benchmarks/ssd/implementations/pytorch
triggers:
- type: "ConfigChange"
strategy:
type: Docker
dockerStrategy:
dockerfilePath: Dockerfile
from:
kind: DockerImage
name: nvcr.io/nvidia/pytorch:20.06-py3
EOF
```

# Allow extra MIG modes

```
oc apply -f mig-custom.yaml
set "/spec/migManager/config/name" = custom-mig-config
```

# Enable MIG metrics

```
oc patch clusterpolicy/gpu-cluster-policy --type='json' -p='[{"op": "replace", "path": "/spec/dcgmExporter/repository", "value": "nvcr.io/nvidia/k8s"}]'
oc patch clusterpolicy/gpu-cluster-policy --type='json' -p='[{"op": "replace", "path": "/spec/dcgmExporter/image", "value": "dcgm-exporter"}]'
oc patch clusterpolicy/gpu-cluster-policy --type='json' -p='[{"op": "replace", "path": "/spec/dcgmExporter/version", "value": "2.3.1-2.6.1-ubi8"}]'
```

0 comments on commit f04181b

Please sign in to comment.