-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
gpu_operator_extend_metrics: new role to deploy extra DCGM metrics
- Loading branch information
Showing
10 changed files
with
439 additions
and
0 deletions.
There are no files selected for viewing
20 changes: 20 additions & 0 deletions
20
roles/gpu_operator/gpu_operator_extend_metrics/defaults/main/config.yml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
# Auto-generated file, do not edit manually ... | ||
# Toolbox generate command: repo generate_ansible_default_settings | ||
# Source component: GPU_Operator.extend_metrics | ||
|
||
# Parameters | ||
# if True, include the default DCGM metrics in the custom config | ||
gpu_operator_extend_metrics_include_defaults: true | ||
|
||
# if True, include well-known interesting DCGM metrics in the custom config | ||
gpu_operator_extend_metrics_include_well_known: false | ||
|
||
# namespace in which the GPU Operator is deployed | ||
gpu_operator_extend_metrics_namespace: nvidia-gpu-operator | ||
|
||
# name of the ConfigMap where the configuration will be stored | ||
gpu_operator_extend_metrics_configmap_name: metrics-config | ||
|
||
# if not None, a [{name,type,description}*] list of dictionaries with the extra metrics to include in the custom config | ||
# Type: List | ||
gpu_operator_extend_metrics_extra_metrics: null |
Empty file.
90 changes: 90 additions & 0 deletions
90
roles/gpu_operator/gpu_operator_extend_metrics/files/dcgm-metrics.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
# Format | ||
# If line starts with a '#' it is considered a comment | ||
# DCGM FIELD, Prometheus metric type, help message | ||
|
||
# Clocks | ||
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). | ||
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). | ||
|
||
# Temperature | ||
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). | ||
DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). | ||
|
||
# Power | ||
DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). | ||
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). | ||
|
||
# PCIE | ||
# DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML. | ||
# DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML. | ||
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. | ||
|
||
# Utilization (the sample period varies depending on the product) | ||
DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). | ||
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). | ||
DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). | ||
DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %). | ||
|
||
# Errors and violations | ||
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. | ||
# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). | ||
# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). | ||
# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). | ||
# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). | ||
# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). | ||
# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). | ||
|
||
# Memory usage | ||
DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). | ||
DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). | ||
|
||
# ECC | ||
# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. | ||
# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. | ||
# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. | ||
# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. | ||
|
||
# Retired pages | ||
# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. | ||
# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. | ||
# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. | ||
|
||
# NVLink | ||
# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. | ||
# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. | ||
# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. | ||
# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. | ||
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. | ||
# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. | ||
|
||
# VGPU License status | ||
DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status | ||
|
||
# Remapped rows | ||
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors | ||
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors | ||
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed | ||
|
||
# Static configuration information. These appear as labels on the other metrics | ||
DCGM_FI_DRIVER_VERSION, label, Driver Version | ||
# DCGM_FI_NVML_VERSION, label, NVML Version | ||
# DCGM_FI_DEV_BRAND, label, Device Brand | ||
# DCGM_FI_DEV_SERIAL, label, Device Serial Number | ||
# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version | ||
# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version | ||
# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version | ||
# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version | ||
# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device | ||
|
||
# DCP metrics | ||
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %). | ||
# DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %). | ||
# DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %). | ||
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %). | ||
DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %). | ||
# DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %). | ||
# DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %). | ||
# DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %). | ||
DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. | ||
DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. | ||
|
164 changes: 164 additions & 0 deletions
164
roles/gpu_operator/gpu_operator_extend_metrics/files/gpu.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
{ | ||
"apiVersion": "nvidia.com/v1", | ||
"kind": "ClusterPolicy", | ||
"metadata": { | ||
"creationTimestamp": "2023-10-24T19:28:52Z", | ||
"generation": 17, | ||
"name": "gpu-cluster-policy", | ||
"resourceVersion": "66537431", | ||
"uid": "444eb444-6cca-4df1-8390-1eb0d29666dd" | ||
}, | ||
"spec": { | ||
"cdi": { | ||
"default": false, | ||
"enabled": false | ||
}, | ||
"daemonsets": { | ||
"rollingUpdate": { | ||
"maxUnavailable": "1" | ||
}, | ||
"updateStrategy": "RollingUpdate" | ||
}, | ||
"dcgm": { | ||
"enabled": true | ||
}, | ||
"dcgmExporter": { | ||
"config": { | ||
"name": "dcgm-metrics-list" | ||
}, | ||
"enabled": true, | ||
"serviceMonitor": { | ||
"enabled": true | ||
} | ||
}, | ||
"devicePlugin": { | ||
"config": { | ||
"default": "any", | ||
"name": "time-slicing-config-all" | ||
}, | ||
"enabled": true | ||
}, | ||
"driver": { | ||
"certConfig": { | ||
"name": "" | ||
}, | ||
"enabled": true, | ||
"kernelModuleConfig": { | ||
"name": "" | ||
}, | ||
"licensingConfig": { | ||
"configMapName": "", | ||
"nlsEnabled": true | ||
}, | ||
"repoConfig": { | ||
"configMapName": "" | ||
}, | ||
"upgradePolicy": { | ||
"autoUpgrade": true, | ||
"drain": { | ||
"deleteEmptyDir": false, | ||
"enable": false, | ||
"force": false, | ||
"timeoutSeconds": 300 | ||
}, | ||
"maxParallelUpgrades": 1, | ||
"maxUnavailable": "25%", | ||
"podDeletion": { | ||
"deleteEmptyDir": false, | ||
"force": false, | ||
"timeoutSeconds": 300 | ||
}, | ||
"waitForCompletion": { | ||
"timeoutSeconds": 0 | ||
} | ||
}, | ||
"useNvidiaDriverCRD": false, | ||
"virtualTopology": { | ||
"config": "" | ||
} | ||
}, | ||
"gds": { | ||
"enabled": false | ||
}, | ||
"gfd": { | ||
"enabled": true | ||
}, | ||
"kataManager": { | ||
"config": { | ||
"artifactsDir": "/opt/nvidia-gpu-operator/artifacts/runtimeclasses" | ||
} | ||
}, | ||
"mig": { | ||
"strategy": "single" | ||
}, | ||
"migManager": { | ||
"config": { | ||
"default": "all-disabled", | ||
"name": "default-mig-parted-config" | ||
}, | ||
"enabled": true | ||
}, | ||
"nodeStatusExporter": { | ||
"enabled": true | ||
}, | ||
"operator": { | ||
"defaultRuntime": "crio", | ||
"initContainer": {}, | ||
"runtimeClass": "nvidia", | ||
"use_ocp_driver_toolkit": true | ||
}, | ||
"sandboxDevicePlugin": { | ||
"enabled": true | ||
}, | ||
"sandboxWorkloads": { | ||
"defaultWorkload": "container", | ||
"enabled": false | ||
}, | ||
"toolkit": { | ||
"enabled": true, | ||
"installDir": "/usr/local/nvidia" | ||
}, | ||
"validator": { | ||
"plugin": { | ||
"env": [ | ||
{ | ||
"name": "WITH_WORKLOAD", | ||
"value": "false" | ||
} | ||
] | ||
} | ||
}, | ||
"vfioManager": { | ||
"enabled": true | ||
}, | ||
"vgpuDeviceManager": { | ||
"config": { | ||
"default": "default" | ||
}, | ||
"enabled": true | ||
}, | ||
"vgpuManager": { | ||
"enabled": false | ||
} | ||
}, | ||
"status": { | ||
"conditions": [ | ||
{ | ||
"lastTransitionTime": "2023-11-07T13:59:43Z", | ||
"message": "ClusterPolicy is ready as all resources have been successfully reconciled", | ||
"reason": "Reconciled", | ||
"status": "True", | ||
"type": "Ready" | ||
}, | ||
{ | ||
"lastTransitionTime": "2023-11-07T13:59:43Z", | ||
"message": "", | ||
"reason": "Ready", | ||
"status": "False", | ||
"type": "Error" | ||
} | ||
], | ||
"namespace": "nvidia-gpu-operator", | ||
"state": "ready" | ||
} | ||
} |
8 changes: 8 additions & 0 deletions
8
roles/gpu_operator/gpu_operator_extend_metrics/files/well-known.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# DCP metrics | extra metrics disabled in the default DCGM metrics set | ||
|
||
DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %). | ||
DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %). | ||
|
||
DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %). | ||
DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %). | ||
DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
--- | ||
dependencies: | ||
- role: check_deps |
Oops, something went wrong.