Skip to content

Commit

Permalink
gpu_operator_extend_metrics: new role to deploy extra DCGM metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
kpouget committed Nov 9, 2023
1 parent e793a3d commit 7d4177a
Show file tree
Hide file tree
Showing 10 changed files with 439 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Auto-generated file, do not edit manually ...
# Toolbox generate command: repo generate_ansible_default_settings
# Source component: GPU_Operator.extend_metrics

# Parameters
# if True, include the default DCGM metrics in the custom config
gpu_operator_extend_metrics_include_defaults: true

# if True, include well-known interesting DCGM metrics in the custom config
gpu_operator_extend_metrics_include_well_known: false

# namespace in which the GPU Operator is deployed
gpu_operator_extend_metrics_namespace: nvidia-gpu-operator

# name of the ConfigMap where the configuration will be stored
gpu_operator_extend_metrics_configmap_name: metrics-config

# if not None, a [{name,type,description}*] list of dictionaries with the extra metrics to include in the custom config
# Type: List
gpu_operator_extend_metrics_extra_metrics: null
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Format
# If a line starts with a '#', it is considered a comment
# DCGM FIELD, Prometheus metric type, help message

# Clocks
DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).

# Temperature
DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).

# Power
DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).

# PCIE
# DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML.
# DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML.
DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.

# Utilization (the sample period varies depending on the product)
DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %).

# Errors and violations
DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).

# Memory usage
DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).

# ECC
# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.

# Retired pages
# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors.
# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors.
# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.

# NVLink
# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes.
# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload.

# VGPU License status
DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status

# Remapped rows
DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed

# Static configuration information. These appear as labels on the other metrics
DCGM_FI_DRIVER_VERSION, label, Driver Version
# DCGM_FI_NVML_VERSION, label, NVML Version
# DCGM_FI_DEV_BRAND, label, Device Brand
# DCGM_FI_DEV_SERIAL, label, Device Serial Number
# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version
# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version
# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version
# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version
# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device

# DCP metrics
DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %).
# DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %).
# DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %).
DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %).
DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %).
# DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %).
# DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %).
# DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %).
DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second.
DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second.

164 changes: 164 additions & 0 deletions roles/gpu_operator/gpu_operator_extend_metrics/files/gpu.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
{
"apiVersion": "nvidia.com/v1",
"kind": "ClusterPolicy",
"metadata": {
"creationTimestamp": "2023-10-24T19:28:52Z",
"generation": 17,
"name": "gpu-cluster-policy",
"resourceVersion": "66537431",
"uid": "444eb444-6cca-4df1-8390-1eb0d29666dd"
},
"spec": {
"cdi": {
"default": false,
"enabled": false
},
"daemonsets": {
"rollingUpdate": {
"maxUnavailable": "1"
},
"updateStrategy": "RollingUpdate"
},
"dcgm": {
"enabled": true
},
"dcgmExporter": {
"config": {
"name": "dcgm-metrics-list"
},
"enabled": true,
"serviceMonitor": {
"enabled": true
}
},
"devicePlugin": {
"config": {
"default": "any",
"name": "time-slicing-config-all"
},
"enabled": true
},
"driver": {
"certConfig": {
"name": ""
},
"enabled": true,
"kernelModuleConfig": {
"name": ""
},
"licensingConfig": {
"configMapName": "",
"nlsEnabled": true
},
"repoConfig": {
"configMapName": ""
},
"upgradePolicy": {
"autoUpgrade": true,
"drain": {
"deleteEmptyDir": false,
"enable": false,
"force": false,
"timeoutSeconds": 300
},
"maxParallelUpgrades": 1,
"maxUnavailable": "25%",
"podDeletion": {
"deleteEmptyDir": false,
"force": false,
"timeoutSeconds": 300
},
"waitForCompletion": {
"timeoutSeconds": 0
}
},
"useNvidiaDriverCRD": false,
"virtualTopology": {
"config": ""
}
},
"gds": {
"enabled": false
},
"gfd": {
"enabled": true
},
"kataManager": {
"config": {
"artifactsDir": "/opt/nvidia-gpu-operator/artifacts/runtimeclasses"
}
},
"mig": {
"strategy": "single"
},
"migManager": {
"config": {
"default": "all-disabled",
"name": "default-mig-parted-config"
},
"enabled": true
},
"nodeStatusExporter": {
"enabled": true
},
"operator": {
"defaultRuntime": "crio",
"initContainer": {},
"runtimeClass": "nvidia",
"use_ocp_driver_toolkit": true
},
"sandboxDevicePlugin": {
"enabled": true
},
"sandboxWorkloads": {
"defaultWorkload": "container",
"enabled": false
},
"toolkit": {
"enabled": true,
"installDir": "/usr/local/nvidia"
},
"validator": {
"plugin": {
"env": [
{
"name": "WITH_WORKLOAD",
"value": "false"
}
]
}
},
"vfioManager": {
"enabled": true
},
"vgpuDeviceManager": {
"config": {
"default": "default"
},
"enabled": true
},
"vgpuManager": {
"enabled": false
}
},
"status": {
"conditions": [
{
"lastTransitionTime": "2023-11-07T13:59:43Z",
"message": "ClusterPolicy is ready as all resources have been successfully reconciled",
"reason": "Reconciled",
"status": "True",
"type": "Ready"
},
{
"lastTransitionTime": "2023-11-07T13:59:43Z",
"message": "",
"reason": "Ready",
"status": "False",
"type": "Error"
}
],
"namespace": "nvidia-gpu-operator",
"state": "ready"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# DCP metrics | extra metrics disabled in the default DCGM metrics set

DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %).
DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %).

DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %).
DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %).
DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %).
3 changes: 3 additions & 0 deletions roles/gpu_operator/gpu_operator_extend_metrics/meta/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
---
dependencies:
- role: check_deps
Loading

0 comments on commit 7d4177a

Please sign in to comment.