From 7d4177a87cc1b1c79e921e298c609dbf19ccf231 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 9 Nov 2023 11:39:31 +0100 Subject: [PATCH] gpu_operator_extend_metrics: new role to deploy extra DCGM metrics --- .../defaults/main/config.yml | 20 +++ .../gpu_operator_extend_metrics/files/.keep | 0 .../files/dcgm-metrics.csv | 90 ++++++++++ .../files/gpu.json | 164 ++++++++++++++++++ .../files/well-known.csv | 8 + .../gpu_operator_extend_metrics/meta/main.yml | 3 + .../tasks/main.yml | 113 ++++++++++++ .../templates/configmap.yaml.j2 | 16 ++ .../vars/main/resources.yml | 2 + toolbox/gpu_operator.py | 23 +++ 10 files changed, 439 insertions(+) create mode 100644 roles/gpu_operator/gpu_operator_extend_metrics/defaults/main/config.yml create mode 100644 roles/gpu_operator/gpu_operator_extend_metrics/files/.keep create mode 100644 roles/gpu_operator/gpu_operator_extend_metrics/files/dcgm-metrics.csv create mode 100644 roles/gpu_operator/gpu_operator_extend_metrics/files/gpu.json create mode 100644 roles/gpu_operator/gpu_operator_extend_metrics/files/well-known.csv create mode 100644 roles/gpu_operator/gpu_operator_extend_metrics/meta/main.yml create mode 100644 roles/gpu_operator/gpu_operator_extend_metrics/tasks/main.yml create mode 100644 roles/gpu_operator/gpu_operator_extend_metrics/templates/configmap.yaml.j2 create mode 100644 roles/gpu_operator/gpu_operator_extend_metrics/vars/main/resources.yml diff --git a/roles/gpu_operator/gpu_operator_extend_metrics/defaults/main/config.yml b/roles/gpu_operator/gpu_operator_extend_metrics/defaults/main/config.yml new file mode 100644 index 0000000000..9e41288d27 --- /dev/null +++ b/roles/gpu_operator/gpu_operator_extend_metrics/defaults/main/config.yml @@ -0,0 +1,20 @@ +# Auto-generated file, do not edit manually ... 
+# Toolbox generate command: repo generate_ansible_default_settings +# Source component: GPU_Operator.extend_metrics + +# Parameters +# if True, include the default DCGM metrics in the custom config +gpu_operator_extend_metrics_include_defaults: true + +# if True, include well-known interesting DCGM metrics in the custom config +gpu_operator_extend_metrics_include_well_known: false + +# namespace in which the GPU Operator is deployed +gpu_operator_extend_metrics_namespace: nvidia-gpu-operator + +# name of the ConfigMap where the configuration will be stored +gpu_operator_extend_metrics_configmap_name: metrics-config + +# if not None, a [{name,type,description}*] list of dictionaries with the extra metrics to include in the custom config +# Type: List +gpu_operator_extend_metrics_extra_metrics: null diff --git a/roles/gpu_operator/gpu_operator_extend_metrics/files/.keep b/roles/gpu_operator/gpu_operator_extend_metrics/files/.keep new file mode 100644 index 0000000000..e69de29bb2 diff --git a/roles/gpu_operator/gpu_operator_extend_metrics/files/dcgm-metrics.csv b/roles/gpu_operator/gpu_operator_extend_metrics/files/dcgm-metrics.csv new file mode 100644 index 0000000000..aa263b6363 --- /dev/null +++ b/roles/gpu_operator/gpu_operator_extend_metrics/files/dcgm-metrics.csv @@ -0,0 +1,90 @@ +# Format +# If line starts with a '#' it is considered a comment +# DCGM FIELD, Prometheus metric type, help message + +# Clocks +DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz). +DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz). + +# Temperature +DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C). +DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C). + +# Power +DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W). +DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ). + +# PCIE +# DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML. 
+# DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML. +DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries. + +# Utilization (the sample period varies depending on the product) +DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %). +DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %). +DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %). +DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %). + +# Errors and violations +DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered. +# DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us). +# DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us). +# DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us). +# DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us). +# DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us). +# DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us). + +# Memory usage +DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB). +DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB). + +# ECC +# DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors. +# DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors. +# DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors. +# DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors. + +# Retired pages +# DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors. +# DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors. 
+# DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement. + +# NVLink +# DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors. +# DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors. +# DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries. +# DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors. +DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes. +# DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload. + +# VGPU License status +DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status + +# Remapped rows +DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors +DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors +DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed + +# Static configuration information. These appear as labels on the other metrics +DCGM_FI_DRIVER_VERSION, label, Driver Version +# DCGM_FI_NVML_VERSION, label, NVML Version +# DCGM_FI_DEV_BRAND, label, Device Brand +# DCGM_FI_DEV_SERIAL, label, Device Serial Number +# DCGM_FI_DEV_OEM_INFOROM_VER, label, OEM inforom version +# DCGM_FI_DEV_ECC_INFOROM_VER, label, ECC inforom version +# DCGM_FI_DEV_POWER_INFOROM_VER, label, Power management object inforom version +# DCGM_FI_DEV_INFOROM_IMAGE_VER, label, Inforom image version +# DCGM_FI_DEV_VBIOS_VERSION, label, VBIOS version of the device + +# DCP metrics +DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %). +# DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %). +# DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %). 
+DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %). +DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %). +# DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %). +# DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %). +# DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %). +DCGM_FI_PROF_PCIE_TX_BYTES, gauge, The rate of data transmitted over the PCIe bus - including both protocol headers and data payloads - in bytes per second. +DCGM_FI_PROF_PCIE_RX_BYTES, gauge, The rate of data received over the PCIe bus - including both protocol headers and data payloads - in bytes per second. + diff --git a/roles/gpu_operator/gpu_operator_extend_metrics/files/gpu.json b/roles/gpu_operator/gpu_operator_extend_metrics/files/gpu.json new file mode 100644 index 0000000000..4d579482f8 --- /dev/null +++ b/roles/gpu_operator/gpu_operator_extend_metrics/files/gpu.json @@ -0,0 +1,164 @@ +{ + "apiVersion": "nvidia.com/v1", + "kind": "ClusterPolicy", + "metadata": { + "creationTimestamp": "2023-10-24T19:28:52Z", + "generation": 17, + "name": "gpu-cluster-policy", + "resourceVersion": "66537431", + "uid": "444eb444-6cca-4df1-8390-1eb0d29666dd" + }, + "spec": { + "cdi": { + "default": false, + "enabled": false + }, + "daemonsets": { + "rollingUpdate": { + "maxUnavailable": "1" + }, + "updateStrategy": "RollingUpdate" + }, + "dcgm": { + "enabled": true + }, + "dcgmExporter": { + "config": { + "name": "dcgm-metrics-list" + }, + "enabled": true, + "serviceMonitor": { + "enabled": true + } + }, + "devicePlugin": { + "config": { + "default": "any", + "name": "time-slicing-config-all" + }, + "enabled": true + }, + "driver": { + "certConfig": { + "name": "" + }, + "enabled": true, + "kernelModuleConfig": { + "name": "" + }, + "licensingConfig": { + "configMapName": "", + 
"nlsEnabled": true + }, + "repoConfig": { + "configMapName": "" + }, + "upgradePolicy": { + "autoUpgrade": true, + "drain": { + "deleteEmptyDir": false, + "enable": false, + "force": false, + "timeoutSeconds": 300 + }, + "maxParallelUpgrades": 1, + "maxUnavailable": "25%", + "podDeletion": { + "deleteEmptyDir": false, + "force": false, + "timeoutSeconds": 300 + }, + "waitForCompletion": { + "timeoutSeconds": 0 + } + }, + "useNvidiaDriverCRD": false, + "virtualTopology": { + "config": "" + } + }, + "gds": { + "enabled": false + }, + "gfd": { + "enabled": true + }, + "kataManager": { + "config": { + "artifactsDir": "/opt/nvidia-gpu-operator/artifacts/runtimeclasses" + } + }, + "mig": { + "strategy": "single" + }, + "migManager": { + "config": { + "default": "all-disabled", + "name": "default-mig-parted-config" + }, + "enabled": true + }, + "nodeStatusExporter": { + "enabled": true + }, + "operator": { + "defaultRuntime": "crio", + "initContainer": {}, + "runtimeClass": "nvidia", + "use_ocp_driver_toolkit": true + }, + "sandboxDevicePlugin": { + "enabled": true + }, + "sandboxWorkloads": { + "defaultWorkload": "container", + "enabled": false + }, + "toolkit": { + "enabled": true, + "installDir": "/usr/local/nvidia" + }, + "validator": { + "plugin": { + "env": [ + { + "name": "WITH_WORKLOAD", + "value": "false" + } + ] + } + }, + "vfioManager": { + "enabled": true + }, + "vgpuDeviceManager": { + "config": { + "default": "default" + }, + "enabled": true + }, + "vgpuManager": { + "enabled": false + } + }, + "status": { + "conditions": [ + { + "lastTransitionTime": "2023-11-07T13:59:43Z", + "message": "ClusterPolicy is ready as all resources have been successfully reconciled", + "reason": "Reconciled", + "status": "True", + "type": "Ready" + }, + { + "lastTransitionTime": "2023-11-07T13:59:43Z", + "message": "", + "reason": "Ready", + "status": "False", + "type": "Error" + } + ], + "namespace": "nvidia-gpu-operator", + "state": "ready" + } +} diff --git 
a/roles/gpu_operator/gpu_operator_extend_metrics/files/well-known.csv b/roles/gpu_operator/gpu_operator_extend_metrics/files/well-known.csv new file mode 100644 index 0000000000..41fac09ffe --- /dev/null +++ b/roles/gpu_operator/gpu_operator_extend_metrics/files/well-known.csv @@ -0,0 +1,8 @@ +# DCP metrics | extra metrics disabled in the default DCGM metrics set + +DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %). +DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %). + +DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %). +DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %). +DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %). diff --git a/roles/gpu_operator/gpu_operator_extend_metrics/meta/main.yml b/roles/gpu_operator/gpu_operator_extend_metrics/meta/main.yml new file mode 100644 index 0000000000..bd0638df3e --- /dev/null +++ b/roles/gpu_operator/gpu_operator_extend_metrics/meta/main.yml @@ -0,0 +1,3 @@ +--- +dependencies: + - role: check_deps diff --git a/roles/gpu_operator/gpu_operator_extend_metrics/tasks/main.yml b/roles/gpu_operator/gpu_operator_extend_metrics/tasks/main.yml new file mode 100644 index 0000000000..4175264c65 --- /dev/null +++ b/roles/gpu_operator/gpu_operator_extend_metrics/tasks/main.yml @@ -0,0 +1,113 @@ +- name: Create the src artifacts directory + file: + path: "{{ artifact_extra_logs_dir }}/src/" + state: directory + mode: '0755' + +- name: Create the artifacts directory + file: + path: "{{ artifact_extra_logs_dir }}/artifacts/" + state: directory + mode: '0755' + +- name: Give the default metrics URL + debug: 'msg="Metrics defined upstream at https://raw.githubusercontent.com/NVIDIA/dcgm-exporter/main/etc/dcp-metrics-included.csv"' + when: gpu_operator_extend_metrics_include_defaults | bool + +- name: Prepare the metrics file + command: + touch "{{ 
artifact_extra_logs_dir }}/src/metrics.csv" + +- name: Include the default metrics + shell: | + cat <<EOF >> "{{ artifact_extra_logs_dir }}/src/metrics.csv" + {{lookup('file', default_metrics_file)}} + EOF + when: gpu_operator_extend_metrics_include_defaults | bool + +- name: Add the well-known metrics + shell: | + cat <<EOF >> "{{ artifact_extra_logs_dir }}/src/metrics.csv" + + # + # Well-known custom metrics + # + + {{lookup('file', well_known_metrics_file)}} + EOF + when: gpu_operator_extend_metrics_include_well_known | bool + +- name: Add the extra metrics + shell: | + cat <<EOF >> "{{ artifact_extra_logs_dir }}/src/metrics.csv" + + # + # Extra custom metrics + # + + {% for metric in gpu_operator_extend_metrics_extra_metrics %} + {{ metric["name"] }}, {{ metric["type"] }}, {{ metric["description"] }} + {% endfor %} + EOF + when: gpu_operator_extend_metrics_extra_metrics is not none + +- name: Create the ConfigMap definition + shell: + oc create configmap {{ gpu_operator_extend_metrics_configmap_name }} + -n {{ gpu_operator_extend_metrics_namespace }} + --from-file="{{ artifact_extra_logs_dir }}/src/metrics.csv" + --dry-run=client + -oyaml + > "{{ artifact_extra_logs_dir }}/src/configmap.yaml" + +- name: Instantiate the ConfigMap + command: + oc apply -f "{{ artifact_extra_logs_dir }}/src/configmap.yaml" + +- name: Cleanup the ClusterPolicy (to force the update in the next task) + command: | + oc patch clusterpolicy/gpu-cluster-policy \ + --type merge \ + -p '{"spec": {"dcgmExporter": {"config": {"name": ""}}}}' + +- name: Update the ClusterPolicy + shell: | + set -o pipefail; + set -e; + + oc get clusterpolicy/gpu-cluster-policy -ojson > "{{ artifact_extra_logs_dir }}/artifacts/cluster_policy_old.json" + cat "{{ artifact_extra_logs_dir }}/artifacts/cluster_policy_old.json" \ + | jq '.spec.dcgmExporter.config.name = "{{ gpu_operator_extend_metrics_configmap_name }}" | .spec.dcgmExporter.config.env["DCGM_EXPORTER_COLLECTORS"] = "/etc/dcgm-exporter/dcgm-metrics.csv"' \ + | jq 
'del(.status) | del(.metadata.resourceVersion) | del(.metadata.creationTimestamp) | del(.metadata.uid) | del(.metadata.generation)' \ + > "{{ artifact_extra_logs_dir }}/src/cluster_policy_new.json" + + oc apply -f "{{ artifact_extra_logs_dir }}/src/cluster_policy_new.json" + + +- name: Force bounce the GPU DCGM daemonsets + command: + oc delete daemonsets nvidia-dcgm-exporter + -n {{ gpu_operator_extend_metrics_namespace }} + --ignore-not-found + +- name: Wait for the DaemonSets and capture artifacts + block: + - name: Wait for the DaemonSets to be all available + shell: + set -o pipefail; + oc get daemonsets + -o=jsonpath="{range .items[*]}{.metadata.name}{' ='}{.status.numberUnavailable}{'=\n'}{end}" + -n {{ gpu_operator_extend_metrics_namespace }} + | grep -v "==" || true + register: daemonsets_not_ready + until: not daemonsets_not_ready.stdout + retries: 60 + delay: 10 + failed_when: daemonsets_not_ready.stdout | length > 0 + + always: + - name: Capture the ClusterPolicy + shell: + oc get clusterpolicy/gpu-cluster-policy + -ojson + > "{{ artifact_extra_logs_dir }}/artifacts/cluster_policy_new.json" diff --git a/roles/gpu_operator/gpu_operator_extend_metrics/templates/configmap.yaml.j2 b/roles/gpu_operator/gpu_operator_extend_metrics/templates/configmap.yaml.j2 new file mode 100644 index 0000000000..2b6eccd255 --- /dev/null +++ b/roles/gpu_operator/gpu_operator_extend_metrics/templates/configmap.yaml.j2 @@ -0,0 +1,16 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ gpu_operator_enable_time_sharing_configmap_name }} + namespace: {{ gpu_operator_enable_time_sharing_namespace }} +data: + any: |- + version: v1 + flags: + migStrategy: none + sharing: + timeSlicing: + resources: + - name: nvidia.com/gpu + replicas: {{ gpu_operator_enable_time_sharing_replicas }} diff --git a/roles/gpu_operator/gpu_operator_extend_metrics/vars/main/resources.yml b/roles/gpu_operator/gpu_operator_extend_metrics/vars/main/resources.yml new file mode 100644 index 
0000000000..b2c233f3d0 --- /dev/null +++ b/roles/gpu_operator/gpu_operator_extend_metrics/vars/main/resources.yml @@ -0,0 +1,2 @@ +default_metrics_file: files/dcgm-metrics.csv +well_known_metrics_file: files/well-known.csv diff --git a/toolbox/gpu_operator.py b/toolbox/gpu_operator.py index 11f420ade0..b0ef8ca34a 100644 --- a/toolbox/gpu_operator.py +++ b/toolbox/gpu_operator.py @@ -165,3 +165,26 @@ def enable_time_sharing(self, replicas, namespace="nvidia-gpu-operator", configm """ return RunAnsibleRole(locals()) + + + @AnsibleRole("gpu_operator_extend_metrics") + @AnsibleMappedParams + def extend_metrics(self, + include_defaults=True, + include_well_known=False, + namespace="nvidia-gpu-operator", + configmap_name="metrics-config", + extra_metrics : list = None, + ): + """ + Deploy a custom DCGM metrics configuration in the GPU Operator ClusterPolicy + + Args: + namespace: namespace in which the GPU Operator is deployed + configmap_name: name of the ConfigMap where the configuration will be stored + include_defaults: if True, include the default DCGM metrics in the custom config + include_well_known: if True, include well-known interesting DCGM metrics in the custom config + extra_metrics: if not None, a [{name,type,description}*] list of dictionaries with the extra metrics to include in the custom config + """ + + return RunAnsibleRole(locals())