Skip to content

Commit e168a76

Browse files
authored
Watsonx-serving: test on the DGX (#94)
2 parents 7d511b6 + 3eceb35 commit e168a76

File tree

17 files changed

+480
-9
lines changed

17 files changed

+480
-9
lines changed

roles/cluster/cluster_deploy_operator/tasks/main.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,12 @@
283283
when: operator_csv_phase.stdout != "Succeeded"
284284

285285
rescue:
286+
- name: Store the CSV that have been installed
287+
shell:
288+
oc get ClusterServiceVersion
289+
-n "{{ cluster_deploy_operator_namespace }}"
290+
> {{ artifact_extra_logs_dir }}/all_csv.status
291+
286292
- name: Capture the Catalog Operator logs (debug)
287293
shell:
288294
oc logs deployment.apps/catalog-operator
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Auto-generated file, do not edit manually ...
2+
# Toolbox generate command: repo generate_ansible_default_settings
3+
# Source component: GPU_Operator.extend_metrics
4+
5+
# Parameters
6+
# if True, include the default DCGM metrics in the custom config
7+
gpu_operator_extend_metrics_include_defaults: true
8+
9+
# if True, include well-known interesting DCGM metrics in the custom config
10+
gpu_operator_extend_metrics_include_well_known: false
11+
12+
# namespace in which the GPU Operator is deployed
13+
gpu_operator_extend_metrics_namespace: nvidia-gpu-operator
14+
15+
# name of the ConfigMap where the configuration will be stored
16+
gpu_operator_extend_metrics_configmap_name: metrics-config
17+
18+
# if not None, a [{name,type,description}*] list of dictionaries with the extra metrics to include in the custom config
19+
# Type: List
20+
gpu_operator_extend_metrics_extra_metrics: null

roles/gpu_operator/gpu_operator_extend_metrics/files/.keep

Whitespace-only changes.
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# Format
2+
# If a line starts with a '#', it is considered a comment
3+
# DCGM FIELD, Prometheus metric type, help message
4+
5+
# Clocks
6+
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
7+
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
8+
9+
# Temperature
10+
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
11+
DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).
12+
13+
# Power
14+
DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
15+
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
16+
17+
# PCIE
18+
# DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML.
19+
# DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML.
20+
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
21+
22+
# Utilization (the sample period varies depending on the product)
23+
DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
24+
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
25+
DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
26+
DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %).
27+
28+
# Errors and violations
29+
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
30+
# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
31+
# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
32+
# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
33+
# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
34+
# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
35+
# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
36+
37+
# Memory usage
38+
DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
39+
DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
40+
41+
# ECC
42+
# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
43+
# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
44+
# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
45+
# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.
46+
47+
# Retired pages
48+
# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors.
49+
# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors.
50+
# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.
51+
52+
# NVLink
53+
# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
54+
# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
55+
# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
56+
# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
57+
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes.
58+
# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload.
59+
60+
# VGPU License status
61+
DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
62+
63+
# Remapped rows
64+
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
65+
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
66+
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed
67+
68+
# Static configuration information. These appear as labels on the other metrics
69+
DCGM_FI_DRIVER_VERSION, label, Driver Version
70+
# DCGM_FI_NVML_VERSION, label, NVML Version
71+
# DCGM_FI_DEV_BRAND, label, Device Brand
72+
# DCGM_FI_DEV_SERIAL, label, Device Serial Number
73+
# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version
74+
# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version
75+
# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version
76+
# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version
77+
# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device
78+
79+
# DCP metrics
80+
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %).
81+
# DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %).
82+
# DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %).
83+
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %).
84+
DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %).
85+
# DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %).
86+
# DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %).
87+
# DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %).
88+
DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
89+
DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
90+
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
{
2+
"apiVersion": "nvidia.com/v1",
3+
"kind": "ClusterPolicy",
4+
"metadata": {
5+
"creationTimestamp": "2023-10-24T19:28:52Z",
6+
"generation": 17,
7+
"name": "gpu-cluster-policy",
8+
"resourceVersion": "66537431",
9+
"uid": "444eb444-6cca-4df1-8390-1eb0d29666dd"
10+
},
11+
"spec": {
12+
"cdi": {
13+
"default": false,
14+
"enabled": false
15+
},
16+
"daemonsets": {
17+
"rollingUpdate": {
18+
"maxUnavailable": "1"
19+
},
20+
"updateStrategy": "RollingUpdate"
21+
},
22+
"dcgm": {
23+
"enabled": true
24+
},
25+
"dcgmExporter": {
26+
"config": {
27+
"name": "dcgm-metrics-list"
28+
},
29+
"enabled": true,
30+
"serviceMonitor": {
31+
"enabled": true
32+
}
33+
},
34+
"devicePlugin": {
35+
"config": {
36+
"default": "any",
37+
"name": "time-slicing-config-all"
38+
},
39+
"enabled": true
40+
},
41+
"driver": {
42+
"certConfig": {
43+
"name": ""
44+
},
45+
"enabled": true,
46+
"kernelModuleConfig": {
47+
"name": ""
48+
},
49+
"licensingConfig": {
50+
"configMapName": "",
51+
"nlsEnabled": true
52+
},
53+
"repoConfig": {
54+
"configMapName": ""
55+
},
56+
"upgradePolicy": {
57+
"autoUpgrade": true,
58+
"drain": {
59+
"deleteEmptyDir": false,
60+
"enable": false,
61+
"force": false,
62+
"timeoutSeconds": 300
63+
},
64+
"maxParallelUpgrades": 1,
65+
"maxUnavailable": "25%",
66+
"podDeletion": {
67+
"deleteEmptyDir": false,
68+
"force": false,
69+
"timeoutSeconds": 300
70+
},
71+
"waitForCompletion": {
72+
"timeoutSeconds": 0
73+
}
74+
},
75+
"useNvidiaDriverCRD": false,
76+
"virtualTopology": {
77+
"config": ""
78+
}
79+
},
80+
"gds": {
81+
"enabled": false
82+
},
83+
"gfd": {
84+
"enabled": true
85+
},
86+
"kataManager": {
87+
"config": {
88+
"artifactsDir": "/opt/nvidia-gpu-operator/artifacts/runtimeclasses"
89+
}
90+
},
91+
"mig": {
92+
"strategy": "single"
93+
},
94+
"migManager": {
95+
"config": {
96+
"default": "all-disabled",
97+
"name": "default-mig-parted-config"
98+
},
99+
"enabled": true
100+
},
101+
"nodeStatusExporter": {
102+
"enabled": true
103+
},
104+
"operator": {
105+
"defaultRuntime": "crio",
106+
"initContainer": {},
107+
"runtimeClass": "nvidia",
108+
"use_ocp_driver_toolkit": true
109+
},
110+
"sandboxDevicePlugin": {
111+
"enabled": true
112+
},
113+
"sandboxWorkloads": {
114+
"defaultWorkload": "container",
115+
"enabled": false
116+
},
117+
"toolkit": {
118+
"enabled": true,
119+
"installDir": "/usr/local/nvidia"
120+
},
121+
"validator": {
122+
"plugin": {
123+
"env": [
124+
{
125+
"name": "WITH_WORKLOAD",
126+
"value": "false"
127+
}
128+
]
129+
}
130+
},
131+
"vfioManager": {
132+
"enabled": true
133+
},
134+
"vgpuDeviceManager": {
135+
"config": {
136+
"default": "default"
137+
},
138+
"enabled": true
139+
},
140+
"vgpuManager": {
141+
"enabled": false
142+
}
143+
},
144+
"status": {
145+
"conditions": [
146+
{
147+
"lastTransitionTime": "2023-11-07T13:59:43Z",
148+
"message": "ClusterPolicy is ready as all resources have been successfully reconciled",
149+
"reason": "Reconciled",
150+
"status": "True",
151+
"type": "Ready"
152+
},
153+
{
154+
"lastTransitionTime": "2023-11-07T13:59:43Z",
155+
"message": "",
156+
"reason": "Ready",
157+
"status": "False",
158+
"type": "Error"
159+
}
160+
],
161+
"namespace": "nvidia-gpu-operator",
162+
"state": "ready"
163+
}
164+
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# DCP metrics | extra metrics disabled in the default DCGM metrics set
2+
3+
DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %).
4+
DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %).
5+
6+
DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %).
7+
DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %).
8+
DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %).
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
---
2+
dependencies:
3+
- role: check_deps

0 commit comments

Comments
 (0)